-
Notifications
You must be signed in to change notification settings - Fork 12
Changes from 7 commits
c387ed2
bc99fb2
3f34461
f8fc635
84ede23
3479c01
5cd6ea4
b10b52f
0013844
3ac3fe5
add8a5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,6 +112,14 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: | |
Parse files recursively. | ||
""", | ||
) | ||
parser.add_argument( | ||
"-i", | ||
"--include-topic", | ||
action="store_true", | ||
help=""" | ||
If True, include topic inside the parsed json. | ||
""", | ||
) | ||
parser.add_argument( | ||
"-n", | ||
"--dry-run", | ||
|
@@ -164,14 +172,17 @@ def run( | |
output_dir: Path, | ||
match_filename: str | None, | ||
recursive: bool, | ||
include_topic: bool, | ||
dry_run: bool, | ||
) -> int: | ||
"""Parse one or several articles. | ||
|
||
Parameter description and potential defaults are documented inside of the | ||
`get_parser` function. | ||
""" | ||
from bluesearch.utils import find_files | ||
import json | ||
|
||
from bluesearch.utils import JSONL, find_files | ||
|
||
if input_path is None: | ||
if sys.stdin.isatty(): | ||
|
@@ -211,14 +222,30 @@ def run( | |
try: | ||
parsers = iter_parsers(input_type, input_path) | ||
|
||
for parser in parsers: | ||
for i, parser in enumerate(parsers): | ||
article = Article.parse(parser) | ||
output_file = output_dir / f"{article.uid}.json" | ||
|
||
if output_file.exists(): | ||
raise FileExistsError(f"Output '{output_file}' already exists!") | ||
else: | ||
serialized = article.to_json() | ||
|
||
if include_topic: | ||
topic_path = ( | ||
input_path.parent.parent | ||
/ "topic" | ||
/ f"{input_path.stem}.json" | ||
) | ||
topic_json = JSONL.load_jsonl(topic_path) | ||
|
||
serialized_json = json.loads(serialized) | ||
if input_type == "pubmed-xml-set": | ||
serialized_json["topics"] = topic_json[i]["topics"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the idea here is to dynamically add this Wouldn't it make more sense to define a new attribute This way the schema of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the idea, but I have the feeling that parsing the article and finding the topic are two completely separate steps. I am a bit afraid of making those steps more dependent on each other. Currently, I try to keep everything independent (and the changes in the CI command reasonable). Do you think it is possible to add the new attribute There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see your point and I guess that is why I suggested to set it to an empty IMO one should just look at the Anyway, what do you think? |
||
else: | ||
serialized_json["topics"] = topic_json[0]["topics"] | ||
serialized = json.dumps(serialized_json) | ||
|
||
output_file.write_text(serialized, "utf-8") | ||
|
||
except Exception as e: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not a huge fan of this hardcoding.
IMO we should not assume this is run within a pipeline.
Why don't we just replace
--include-topic
with--topic-path
(or similar) and if the user does not provide it then it means that they are not interestedThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes that is definitely my plan! I did not want to hardcode the variable. I just wanted to be sure we can move on in this direction. What do you think about the way to solve this issue ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So maybe I can review again once you have a final version?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Definitely! I let you know when the PR is done :) Thank you for your help!