Skip to content
This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

Support download of PMC oa_other #596

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions src/bluesearch/database/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def generate_pmc_urls(

Parameters
----------
component : {"author_manuscript", "oa_comm", "oa_noncomm"}
component : {"author_manuscript", "oa_comm", "oa_noncomm", "oa_other"}
Part of the PMC to download.
start_date
Starting date to download the incremental files.
Expand All @@ -108,17 +108,20 @@ def generate_pmc_urls(
ValueError
If the chosen component does not exist on PMC.
"""
base_url = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/"
if component in {"oa_comm", "oa_noncomm"}:
base_url += f"oa_bulk/{component}/xml/"
elif component == "author_manuscript":
base_url += "manuscript/xml/"
else:
avail_components = {"author_manuscript", "oa_comm", "oa_noncomm", "oa_other"}
if component not in avail_components:
raise ValueError(
f"Unexcepted component {component}. "
"Only {'author_manuscript', 'oa_comm', 'oa_noncomm'} are supported."
f"Only {avail_components} "
"are supported."
)

base_url = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/"
if component == "author_manuscript":
base_url += "manuscript/xml/"
else:
base_url += f"oa_bulk/{component}/xml/"

days_list = get_daterange_list(start_date=start_date, end_date=end_date)

url_list = []
Expand Down
3 changes: 2 additions & 1 deletion src/bluesearch/entrypoint/database/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,8 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i

if article_source == ArticleSource.PMC:
url_dict = {}
for component in {"author_manuscript", "oa_comm", "oa_noncomm"}:
avail_components = ["author_manuscript", "oa_comm", "oa_noncomm", "oa_other"]
for component in avail_components:
url_dict[component] = generate_pmc_urls(component, from_month)

if dry_run:
Expand Down
1 change: 1 addition & 0 deletions tests/unit/database/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def test_delta_wrong(self):
("author_manuscript", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/xml/"),
("oa_comm", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/"),
("oa_noncomm", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_noncomm/xml/"),
("oa_other", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_other/xml/"),
],
)
def test_generate_pmc_urls(monkeypatch, component, expected_url_start):
Expand Down
1 change: 1 addition & 0 deletions tests/unit/entrypoint/database/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def fake_download_articles_func(url_list, output_dir):
"author_manuscript",
"oa_comm",
"oa_noncomm",
"oa_other",
}
for sub_dir in pmc_path.iterdir():
assert len(list(sub_dir.iterdir())) == 2
Expand Down