This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

Add topic to parsed json #628

Closed · wants to merge 11 commits

Changes from 7 commits
31 changes: 29 additions & 2 deletions src/bluesearch/entrypoint/database/parse.py
@@ -112,6 +112,14 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
Parse files recursively.
""",
)
parser.add_argument(
"-i",
"--include-topic",
action="store_true",
help="""
If True, include the topic inside the parsed JSON.
""",
)
parser.add_argument(
"-n",
"--dry-run",
@@ -164,14 +172,17 @@ def run(
output_dir: Path,
match_filename: str | None,
recursive: bool,
include_topic: bool,
dry_run: bool,
) -> int:
"""Parse one or several articles.

Parameter description and potential defaults are documented inside of the
`get_parser` function.
"""
from bluesearch.utils import find_files
import json

from bluesearch.utils import JSONL, find_files

if input_path is None:
if sys.stdin.isatty():
@@ -211,14 +222,30 @@
try:
parsers = iter_parsers(input_type, input_path)

for parser in parsers:
for i, parser in enumerate(parsers):
article = Article.parse(parser)
output_file = output_dir / f"{article.uid}.json"

if output_file.exists():
raise FileExistsError(f"Output '{output_file}' already exists!")
else:
serialized = article.to_json()

if include_topic:
topic_path = (
input_path.parent.parent
/ "topic"
/ f"{input_path.stem}.json"
Comment on lines +235 to +237:

jankrepl (Contributor) · Sep 19, 2022:
I am not a huge fan of this hardcoding. IMO we should not assume this is run within a pipeline. Why don't we just replace --include-topic with --topic-path (or similar)? If the user does not provide it, it means they are not interested. (A sketch of this idea follows the thread.)

PR author (Contributor):
Yes, that is definitely my plan! I did not want to hardcode the variable; I just wanted to be sure we can move on in this direction. What do you think about this way of solving the issue?

jankrepl (Contributor):
So maybe I can review again once you have a final version?

PR author (Contributor):
Definitely! I'll let you know when the PR is done :) Thank you for your help!
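For illustration, a minimal sketch of the suggested --topic-path variant (hypothetical: the option name comes from the comment above; the type, default, and help text are assumptions, not part of this PR; Path is imported as elsewhere in this file):

parser.add_argument(
    "--topic-path",
    type=Path,
    default=None,
    help="""
    Path to the topic file produced by topic-extract. If omitted,
    no topics are added to the parsed JSON.
    """,
)

Inside run(), the include_topic flag would then become a check on topic_path, and the hardcoded input_path.parent.parent / "topic" lookup would disappear.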

)
topic_json = JSONL.load_jsonl(topic_path)

serialized_json = json.loads(serialized)
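# For "pubmed-xml-set" input, article i in this file corresponds to
# line i of the per-file topic JSONL written by topic-extract.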
if input_type == "pubmed-xml-set":
serialized_json["topics"] = topic_json[i]["topics"]
jankrepl (Contributor) · Sep 19, 2022:
So the idea here is to dynamically add this "topics" key only inside of the JSON? Wouldn't it make more sense to define a new attribute topics for Article that is an empty list by default, modify this attribute once we have the topics, and then serialize the whole article? This way the schema of the Article is going to be clear. (A sketch of this idea follows the thread.)

PR author (Contributor):
I like the idea, but I have the feeling that parsing the article and finding the topic are two completely separate steps. I am a bit afraid of making those steps more dependent on each other. Currently, I try to keep everything independent (and the changes in the CI command reasonable). Do you think it is possible to add the new attribute topics to Article while keeping things independent?

jankrepl (Contributor) · Sep 19, 2022:
I see your point, and I guess that is why I suggested setting it to an empty list[str] by default. If the user wants, they can provide it later, but at least we declare that the topics attribute could be populated. IMO one should be able to just look at the Article dataclass and see what the final schema of the saved .json is going to be. Once we start writing code that adds new entries to the .json, I am worried about things becoming intractable. Anyway, what do you think?
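For illustration, a minimal sketch of the suggested Article attribute (hypothetical; the real Article dataclass has more fields than shown here):

from dataclasses import dataclass, field

@dataclass
class Article:
    uid: str
    title: str
    # Declared up front so the serialized schema always contains the
    # key; populated later by the (optional) topic-extraction step.
    topics: list[str] = field(default_factory=list)

Serializing the whole dataclass would then keep the schema of the saved .json discoverable from the class definition alone.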

else:
serialized_json["topics"] = topic_json[0]["topics"]
serialized = json.dumps(serialized_json)

output_file.write_text(serialized, "utf-8")

except Exception as e:
2 changes: 2 additions & 0 deletions src/bluesearch/entrypoint/database/run.py
@@ -225,6 +225,7 @@ def program_args(self) -> list[str]:
*BBS_BINARY,
"topic-extract",
*VERBOSITY,
"--i",
GlobalParams().source,
input_dir,
output_dir,
@@ -432,6 +433,7 @@ def program_args(self) -> list[str]:
command = [
*BBS_BINARY,
"parse",
"--i",
*VERBOSITY,
parser,
str(input_dir),
50 changes: 50 additions & 0 deletions src/bluesearch/entrypoint/database/topic_extract.py
@@ -19,6 +19,7 @@

import argparse
import gzip
import json
import logging
from pathlib import Path
from typing import Any
@@ -78,6 +79,14 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
expression. Ignored when 'input_path' is a path to a file.
""",
)
parser.add_argument(
"-i",
"--inc-individual-json",
action="store_true",
help="""
If True, individual JSON files are also saved.
""",
)
parser.add_argument(
"-R",
"--recursive",
@@ -121,12 +130,33 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
return parser


def create_individual_json(path, topic_info):
"""Create json containing the extracted topics.

Parameters
----------
path
Path of the original article
topic_info
Topics extracted for the given article.
"""
folder = path.parent.parent / "topic"
if not folder.exists():
folder.mkdir()

new_path = folder / f"{path.stem}.json"
with new_path.open("w") as f:
line = json.dumps(topic_info.json())
f.write(line)


def run(
*,
source: str,
input_path: Path,
output_file: Path,
match_filename: str | None,
inc_individual_json: bool,
recursive: bool,
overwrite: bool,
dry_run: bool,
@@ -182,6 +212,10 @@ def run(
topic_info.add_journal_topics(
"MeSH", mesh.resolve_parents(journal_topics, mesh_tree)
)

if inc_individual_json:
create_individual_json(path, topic_info)

all_results.append(topic_info.json())
elif article_source is ArticleSource.PUBMED:
if mesh_topic_db is None:
@@ -194,6 +228,7 @@ def run(
logger.info(f"Processing {path}")
with gzip.open(path) as xml_stream:
articles = ElementTree.parse(xml_stream)
topics_per_file = []
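# Collected in article order: parse.py later looks up topic_json[i]
# by position, so this JSONL must preserve the iteration order.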

for i, article in enumerate(articles.iter("PubmedArticle")):
logger.info(f"Processing element in file {i}")
@@ -217,11 +252,23 @@ def run(
"MeSH", mesh.resolve_parents(journal_topics, mesh_tree)
)
all_results.append(topic_info.json())
topics_per_file.append(topic_info.json())

if inc_individual_json:
    folder = path.parent.parent / "topic"
    if not folder.exists():
        folder.mkdir()

    new_path = folder / f"{path.stem}.json"
    JSONL.dump_jsonl(topics_per_file, new_path)

elif article_source is ArticleSource.ARXIV:
for path, article_topics in get_topics_for_arxiv_articles(inputs).items():
topic_info = TopicInfo(source=article_source, path=path)
topic_info.add_article_topics("arXiv", article_topics)

if inc_individual_json:
create_individual_json(path, topic_info)

all_results.append(topic_info.json())
elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}:
for path in inputs:
@@ -234,6 +281,9 @@ def run(
topic_info = TopicInfo(source=ArticleSource(journal), path=path)
topic_info.add_article_topics("Subject Area", [topic])

if inc_individual_json:
create_individual_json(path, topic_info)

all_results.append(topic_info.json())
else:
logger.error(f"The source type {source!r} is not implemented yet")
10 changes: 10 additions & 0 deletions tests/unit/entrypoint/database/test_topic_extract.py
@@ -35,6 +35,7 @@
"overwrite",
"dry_run",
"mesh_topic_db",
"inc_individual_json",
}


@@ -68,6 +69,7 @@ def test_input_path_not_correct(caplog):
output_file=pathlib.Path(""),
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=False,
)
@@ -84,6 +86,7 @@ def test_source_type_not_implemented(test_data_path, caplog, tmp_path):
output_file=tmp_path,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=False,
)
@@ -99,6 +102,7 @@ def test_dry_run(test_data_path, capsys, tmp_path):
output_file=tmp_path,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=True,
)
@@ -131,6 +135,7 @@ def test_pmc_source(test_data_path, capsys, monkeypatch, tmp_path):
output_file=output_jsonl,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=False,
mesh_topic_db=mesh_tree_path,
@@ -157,6 +162,7 @@ def test_pmc_source(test_data_path, capsys, monkeypatch, tmp_path):
output_file=output_jsonl,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=True,
dry_run=False,
mesh_topic_db=mesh_tree_path,
@@ -172,6 +178,7 @@ def test_pmc_source(test_data_path, capsys, monkeypatch, tmp_path):
output_file=output_jsonl,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=False,
mesh_topic_db=mesh_tree_path,
@@ -203,6 +210,7 @@ def test_medbiorxiv_source(capsys, monkeypatch, tmp_path, source):
output_file=output_file,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=False,
)
@@ -251,6 +259,7 @@ def test_pubmed_source(
output_file=output_jsonl,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=False,
mesh_topic_db=mesh_tree_path,
@@ -285,6 +294,7 @@ def test_mesh_topic_db_is_enforced(source, caplog, tmp_path):
output_file=tmp_path,
match_filename=None,
recursive=False,
inc_individual_json=False,
overwrite=False,
dry_run=False,
)
1 change: 1 addition & 0 deletions tox.ini
@@ -155,6 +155,7 @@ filterwarnings =
ignore::DeprecationWarning:docker.*:
ignore::DeprecationWarning:luigi.task:
ignore::DeprecationWarning:transformers.image_utils.*:
ignore::luigi.parameter.UnconsumedParameterWarning
addopts =
--cov
--cov-config=tox.ini