Skip to content

Commit

Permalink
refactor: deprecate gbif_endpoint_set_datetime
Browse files Browse the repository at this point in the history
Deprecate gbif_endpoint_set_datetime in favor of is_synchronized to
indicate the synchronization status of an EDI dataset with GBIF.

Related to c9ebad3.
  • Loading branch information
clnsmth authored Sep 26, 2023
1 parent e067bae commit 2c7ea77
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 56 deletions.
9 changes: 4 additions & 5 deletions src/gbif_registrar/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def register(file_path, local_dataset_id=None):
"local_dataset_group_id": None,
"local_dataset_endpoint": None,
"gbif_dataset_uuid": None,
"gbif_endpoint_set_datetime": None,
"is_synchronized": None,
},
index=[0],
)
Expand All @@ -74,8 +74,7 @@ def complete_registrations(rgstrs):
Parameters
----------
rgstrs : DataFrame
Pandas dataframe with the gbif_endpoint_set_datetime column formatted as
datetime.
Pandas dataframe.
Returns
-------
Expand All @@ -84,13 +83,13 @@ def complete_registrations(rgstrs):
"""
# Get all rows where the rgstrs dataframe columns
# local_dataset_group_id, local_dataset_endpoint, gbif_dataset_uuid,
# gbif_endpoint_set_datetime contain empty values. These are the rows
# is_synchronized contain empty values. These are the rows
# that need to be completed.
record = rgstrs[
(rgstrs["local_dataset_group_id"].isnull())
| (rgstrs["local_dataset_endpoint"].isnull())
| (rgstrs["gbif_dataset_uuid"].isnull())
| (rgstrs["gbif_endpoint_set_datetime"].isnull())
| (rgstrs["is_synchronized"].isnull())
]
# If the record dataframe is empty, then there are no rows to complete.
# Return the rgstrs dataframe.
Expand Down
18 changes: 7 additions & 11 deletions src/gbif_registrar/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ def initialize_registrations(file_path):
The registrations file is a map from datasets in the local repository, to
identifiers in the remote GBIF registry. This file contains additional
information about the local datasets, as well as the most recent datetime
GBIF crawled the local endpoint to synchronize the registry instance. The
registrations file columns (and definitions):
information about the local datasets, as well as the synchronization
status of the local dataset with GBIF. The registrations file columns
(and definitions):
- `local_dataset_id`: The identifier of the dataset in the local
repository system. This is the primary key.
Expand All @@ -27,8 +27,8 @@ def initialize_registrations(file_path):
- `gbif_dataset_uuid`: The registration identifier assigned by GBIF to the
local dataset group. This has a one-to-one relationship with
`local_dataset_group_id`.
- `gbif_endpoint_set_datetime`: The datetime GBIF crawled the
`local_dataset_endpoint`.
- `is_synchronized`: The synchronization status of the local dataset with
GBIF.
Parameters
----------
Expand Down Expand Up @@ -58,17 +58,13 @@ def read_registrations(file_path):
Returns
-------
DataFrame
Pandas dataframe with the gbif_endpoint_set_datetime column formatted as
datetime.
Pandas dataframe.
See Also
--------
check_registrations_file
"""
rgstrs = pd.read_csv(file_path, delimiter=",")
rgstrs["gbif_endpoint_set_datetime"] = pd.to_datetime(
rgstrs["gbif_endpoint_set_datetime"]
)
return rgstrs


Expand All @@ -79,7 +75,7 @@ def expected_cols():
"local_dataset_group_id",
"local_dataset_endpoint",
"gbif_dataset_uuid",
"gbif_endpoint_set_datetime",
"is_synchronized",
]
return cols

Expand Down
28 changes: 14 additions & 14 deletions src/gbif_registrar/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def check_completeness(rgstrs):
"""Checks registrations for completeness.
A complete registration has values for all fields except (perhaps)
`gbif_endpoint_set_datetime`, which is not essential for initiating a GBIF
`is_synchronized`, which is not essential for initiating a GBIF
crawl.
Parameters
Expand All @@ -27,7 +27,7 @@ def check_completeness(rgstrs):
UserWarning
If any registrations are incomplete.
"""
rgstrs = rgstrs[expected_cols()].drop(["gbif_endpoint_set_datetime"], axis=1)
rgstrs = rgstrs[expected_cols()].drop(["is_synchronized"], axis=1)
rgstrs = rgstrs[rgstrs.isna().any(axis=1)]
if len(rgstrs) > 0:
rows = rgstrs.index.to_series() + 1
Expand Down Expand Up @@ -173,12 +173,13 @@ def check_local_endpoints(rgstrs):
)


def check_crawl_datetime(rgstrs):
"""Checks if registrations have been crawled.
def check_is_synchronized(rgstrs):
"""Checks if registrations have been synchronized.
Registrations contain all the information needed for GBIF to successfully
crawl the corresponding dataset and post to the GBIF data portal. Datetime
values in the `gbif_endpoint_set_datetime` indicate the dataset has been crawled.
crawl the corresponding dataset and post to the GBIF data portal. A True
value in the `is_synchronized` field indicates the dataset has been
synchronized with GBIF; False indicates it has not.
Parameters
----------
Expand All @@ -197,13 +198,12 @@ def check_crawl_datetime(rgstrs):
Examples
--------
>>> rgstrs = read_registrations('tests/registrations.csv')
>>> check_crawl_datetime(rgstrs)
>>> check_is_synchronized(rgstrs)
"""
uncrawled = rgstrs["gbif_endpoint_set_datetime"].isna()
if any(uncrawled):
rows = rgstrs[uncrawled].index.to_series() + 1
rows = rows.astype("string")
warnings.warn("Uncrawled registrations in rows: " + ", ".join(rows))
if not rgstrs["is_synchronized"].all():
rows = rgstrs["is_synchronized"].index.to_series() + 1
rows = rows[~rgstrs["is_synchronized"]].astype("string")
warnings.warn("Unsynchronized registrations in rows: " + ", ".join(rows))


def check_local_dataset_id_format(rgstrs):
Expand Down Expand Up @@ -278,7 +278,7 @@ def validate_registrations(file_path, extended_checks=False):
This is a wrapper to `check_completeness`, `check_local_dataset_id`,
`check_group_registrations`, `check_local_endpoints`, and
`check_crawl_datetime`.
`check_is_synchronized`.
Parameters
----------
Expand Down Expand Up @@ -308,7 +308,7 @@ def validate_registrations(file_path, extended_checks=False):
check_local_dataset_id(rgstrs)
check_group_registrations(rgstrs)
check_local_endpoints(rgstrs)
check_crawl_datetime(rgstrs)
check_is_synchronized(rgstrs)
if extended_checks:
check_local_dataset_id_format(rgstrs)
check_local_dataset_group_id_format(rgstrs)
16 changes: 8 additions & 8 deletions tests/registrations.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
local_dataset_id,local_dataset_group_id,local_dataset_endpoint,gbif_dataset_uuid,gbif_endpoint_set_datetime
edi.193.4,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/4/archive_edi.193.4_15494687022457218,e44c5367-9d09-4328-9a5a-d0f41fb22d61,2021-04-21 14:32:00
edi.193.5,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/5/archive_edi.193.5_18246176635637788,e44c5367-9d09-4328-9a5a-d0f41fb22d61,2022-12-05 03:45:00
edi.356.1,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/1/archive_edi.356.1_74665239205233345,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,2021-04-21 10:00:00
edi.356.2,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/2/archive_edi.356.2_74665239205231111,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,2022-12-05 02:00:00
knb-lter-msp.1.1,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/1/archive_knb-lter-msp.1.1_33365239205233345,8c30c4a7-4444-4421-83c0-60f3d3b195b1,2022-12-21 09:00:00
knb-lter-msp.1.2,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/2/archive_knb-lter-msp.1.2_32165239205231111,8c30c4a7-4444-4421-83c0-60f3d3b195b1,2022-12-22 10:00:00
edi.941.3,edi.941,https://pasta.lternet.edu/package/download/eml/edi/941/3,cfb3f6d5-ed7d-4fff-9f1b-f032ed1de485,2023-06-22 00:00:00
local_dataset_id,local_dataset_group_id,local_dataset_endpoint,gbif_dataset_uuid,is_synchronized
edi.193.4,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/4/archive_edi.193.4_15494687022457218,e44c5367-9d09-4328-9a5a-d0f41fb22d61,True
edi.193.5,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/5/archive_edi.193.5_18246176635637788,e44c5367-9d09-4328-9a5a-d0f41fb22d61,True
edi.356.1,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/1/archive_edi.356.1_74665239205233345,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,True
edi.356.2,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/2/archive_edi.356.2_74665239205231111,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,True
knb-lter-msp.1.1,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/1/archive_knb-lter-msp.1.1_33365239205233345,8c30c4a7-4444-4421-83c0-60f3d3b195b1,True
knb-lter-msp.1.2,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/2/archive_knb-lter-msp.1.2_32165239205231111,8c30c4a7-4444-4421-83c0-60f3d3b195b1,True
edi.941.3,edi.941,https://pasta.lternet.edu/package/download/eml/edi/941/3,cfb3f6d5-ed7d-4fff-9f1b-f032ed1de485,True
7 changes: 4 additions & 3 deletions tests/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_get_gbif_dataset_uuid_does_not_exist(rgstrs, mocker):
"local_dataset_endpoint"
] = "https://pasta.lternet.edu/package/download/eml/edi/929/1"
new_row["gbif_dataset_uuid"] = None
new_row["gbif_endpoint_set_datetime"] = None
new_row["is_synchronized"] = False
rgstrs = rgstrs.append(new_row, ignore_index=True)
# Run the get_gbif_dataset_uuid function and check that it returns the new
# GBIF dataset UUID.
Expand Down Expand Up @@ -147,8 +147,9 @@ def test_register_repairs_failed_registration(
assert rgstrs_final.shape[0] == rgstrs_initial.shape[0]
assert rgstrs_final.iloc[-1]["local_dataset_id"] == local_dataset_id
assert rgstrs_final.iloc[-1]["gbif_dataset_uuid"] == gbif_dataset_uuid
# The last 3 columns of the last row should not be None. The datetime is
# the only column that should be None because it hasn't been crawled yet.
# The last 3 columns of the last row should not be None. The
# synchronization status is the only column that should be False, because
# the dataset hasn't been synchronized with GBIF yet.
assert rgstrs_final.iloc[-1, -4:-1].notnull().all()


Expand Down
7 changes: 0 additions & 7 deletions tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,6 @@ def test_read_registrations_reads_file():
assert isinstance(rgstrs, pd.DataFrame)


def test_read_registrations_formats_datetime():
"""Formats the datetime column."""
rgstrs = read_registrations("tests/registrations.csv")
crawl_time = rgstrs["gbif_endpoint_set_datetime"]
assert pd.core.dtypes.common.is_datetime64_dtype(crawl_time)


def test_read_local_dataset_metadata_success(mocker, eml):
"""Test that read_local_dataset_metadata returns a string on success."""
mock_response = mocker.Mock()
Expand Down
16 changes: 8 additions & 8 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,22 +68,22 @@ def test_check_one_to_one_cardinality_warn(rgstrs):
assert rgstrs.loc[3, "local_dataset_endpoint"] in str(warns[0].message)


def test_check_crawl_datetime_valid(rgstrs):
def test_check_is_synchronized_valid(rgstrs):
"""The registrations file is valid, and doesn't throw a warning."""
with warnings.catch_warnings(record=True) as warns:
warnings.simplefilter("always")
validate.check_crawl_datetime(rgstrs)
validate.check_is_synchronized(rgstrs)
assert len(warns) == 0


def test_check_crawl_datetime_warn(rgstrs):
"""Uncrawled registrations result in a warning."""
rgstrs.loc[0, "gbif_endpoint_set_datetime"] = np.nan
rgstrs.loc[2, "gbif_endpoint_set_datetime"] = np.nan
def test_check_is_synchronized_warn(rgstrs):
"""Unsynchronized registrations result in a warning."""
rgstrs.loc[0, "is_synchronized"] = False
rgstrs.loc[2, "is_synchronized"] = False
with warnings.catch_warnings(record=True) as warns:
warnings.simplefilter("always")
validate.check_crawl_datetime(rgstrs)
assert "Uncrawled registrations in rows" in str(warns[0].message)
validate.check_is_synchronized(rgstrs)
assert "Unsynchronized registrations in rows" in str(warns[0].message)
assert "1, 3" in str(warns[0].message)


Expand Down

0 comments on commit 2c7ea77

Please sign in to comment.