From 2c7ea77e6f55d9a34eb6882a72bb1108bc808f49 Mon Sep 17 00:00:00 2001
From: Colin Smith
Date: Tue, 26 Sep 2023 07:10:42 -0700
Subject: [PATCH] refactor: deprecate gbif_endpoint_set_datetime

Deprecate gbif_endpoint_set_datetime in favor of is_synchronized to
indicate the synchronization status of an EDI dataset with GBIF.

Is related to c9ebad36bcacdc1351e171092f30cae06f13c035.
---
 src/gbif_registrar/register.py  |  9 ++++-----
 src/gbif_registrar/utilities.py | 18 +++++++-----------
 src/gbif_registrar/validate.py  | 28 ++++++++++++++--------------
 tests/registrations.csv         | 16 ++++++++--------
 tests/test_register.py          |  7 ++++---
 tests/test_utilities.py         |  7 -------
 tests/test_validate.py          | 16 ++++++++--------
 7 files changed, 45 insertions(+), 56 deletions(-)

diff --git a/src/gbif_registrar/register.py b/src/gbif_registrar/register.py
index f102cba..42a353c 100644
--- a/src/gbif_registrar/register.py
+++ b/src/gbif_registrar/register.py
@@ -57,7 +57,7 @@ def register(file_path, local_dataset_id=None):
             "local_dataset_group_id": None,
             "local_dataset_endpoint": None,
             "gbif_dataset_uuid": None,
-            "gbif_endpoint_set_datetime": None,
+            "is_synchronized": None,
         },
         index=[0],
     )
@@ -74,8 +74,7 @@ def complete_registrations(rgstrs):
     Parameters
     ----------
     rgstrs : DataFrame
-        Pandas dataframe with the gbif_endpoint_set_datetime column formatted as
-        datetime.
+        Pandas dataframe.
 
     Returns
     -------
@@ -84,13 +83,13 @@
     """
     # Get all rows where the rgstrs dataframe columns
     # local_dataset_group_id, local_dataset_endpoint, gbif_dataset_uuid,
-    # gbif_endpoint_set_datetime contain empty values. These are the rows
+    # is_synchronized contain empty values. These are the rows
     # that need to be completed.
     record = rgstrs[
         (rgstrs["local_dataset_group_id"].isnull())
         | (rgstrs["local_dataset_endpoint"].isnull())
         | (rgstrs["gbif_dataset_uuid"].isnull())
-        | (rgstrs["gbif_endpoint_set_datetime"].isnull())
+        | (rgstrs["is_synchronized"].isnull())
     ]
     # If the record dataframe is empty, then there are no rows to complete.
     # Return the rgstrs dataframe.
diff --git a/src/gbif_registrar/utilities.py b/src/gbif_registrar/utilities.py
index 5f58b80..8bb5aba 100644
--- a/src/gbif_registrar/utilities.py
+++ b/src/gbif_registrar/utilities.py
@@ -12,9 +12,9 @@ def initialize_registrations(file_path):
 
     The registrations file is a map from datasets in the local repository, to
     identifiers in the remote GBIF registry. This file contains additional
-    information about the local datasets, as well as the most recent datetime
-    GBIF crawled the local endpoint to synchronize the registry instance. The
-    registrations file columns (and definitions):
+    information about the local datasets, as well as the synchronization
+    status of the local dataset with GBIF. The registrations file columns
+    (and definitions):
 
     - `local_dataset_id`: The identifier of the dataset in the local
       repository system. This is the primary key.
@@ -27,8 +27,8 @@
     - `gbif_dataset_uuid`: The registration identifier assigned by GBIF to
       the local dataset group. This has a one-to-one relationship with
       `local_dataset_group_id`.
-    - `gbif_endpoint_set_datetime`: The datetime GBIF crawled the
-      `local_dataset_endpoint`.
+    - `is_synchronized`: The synchronization status of the local dataset with
+      GBIF.
 
     Parameters
     ----------
@@ -58,17 +58,13 @@ def read_registrations(file_path):
     Returns
     -------
     DataFrame
-        Pandas dataframe with the gbif_endpoint_set_datetime column formatted as
-        datetime.
+        Pandas dataframe.
 
     See Also
     --------
     check_registrations_file
     """
     rgstrs = pd.read_csv(file_path, delimiter=",")
-    rgstrs["gbif_endpoint_set_datetime"] = pd.to_datetime(
-        rgstrs["gbif_endpoint_set_datetime"]
-    )
     return rgstrs
 
 
@@ -79,7 +75,7 @@ def expected_cols():
         "local_dataset_group_id",
         "local_dataset_endpoint",
         "gbif_dataset_uuid",
-        "gbif_endpoint_set_datetime",
+        "is_synchronized",
     ]
     return cols
diff --git a/src/gbif_registrar/validate.py b/src/gbif_registrar/validate.py
index 188afca..6c07990 100644
--- a/src/gbif_registrar/validate.py
+++ b/src/gbif_registrar/validate.py
@@ -9,7 +9,7 @@ def check_completeness(rgstrs):
    """Checks registrations for completeness.
 
     A complete registration has values for all fields except (perhaps)
-    `gbif_endpoint_set_datetime`, which is not essential for initiating a GBIF
+    `is_synchronized`, which is not essential for initiating a GBIF
     crawl.
 
     Parameters
@@ -27,7 +27,7 @@
     UserWarning
         If any registrations are incomplete.
     """
-    rgstrs = rgstrs[expected_cols()].drop(["gbif_endpoint_set_datetime"], axis=1)
+    rgstrs = rgstrs[expected_cols()].drop(["is_synchronized"], axis=1)
     rgstrs = rgstrs[rgstrs.isna().any(axis=1)]
     if len(rgstrs) > 0:
         rows = rgstrs.index.to_series() + 1
@@ -173,12 +173,13 @@ def check_local_endpoints(rgstrs):
         )
 
 
-def check_crawl_datetime(rgstrs):
-    """Checks if registrations have been crawled.
+def check_is_synchronized(rgstrs):
+    """Checks if registrations have been synchronized.
 
     Registrations contain all the information needed for GBIF to successfully
-    crawl the corresponding dataset and post to the GBIF data portal. Datetime
-    values in the `gbif_endpoint_set_datetime` indicate the dataset has been crawled.
+    crawl the corresponding dataset and post to the GBIF data portal. Boolean
+    True/False values in the `is_synchronized` field indicate the dataset is
+    synchronized.
 
     Parameters
     ----------
@@ -197,13 +198,12 @@
     Examples
     --------
     >>> rgstrs = read_registrations('tests/registrations.csv')
-    >>> check_crawl_datetime(rgstrs)
+    >>> check_is_synchronized(rgstrs)
     """
-    uncrawled = rgstrs["gbif_endpoint_set_datetime"].isna()
-    if any(uncrawled):
-        rows = rgstrs[uncrawled].index.to_series() + 1
-        rows = rows.astype("string")
-        warnings.warn("Uncrawled registrations in rows: " + ", ".join(rows))
+    if not rgstrs["is_synchronized"].all():
+        rows = rgstrs["is_synchronized"].index.to_series() + 1
+        rows = rows[~rgstrs["is_synchronized"]].astype("string")
+        warnings.warn("Unsynchronized registrations in rows: " + ", ".join(rows))
 
 
 def check_local_dataset_id_format(rgstrs):
@@ -278,7 +278,7 @@ def validate_registrations(file_path, extended_checks=False):
 
     This is a wrapper to `check_completeness`, `check_local_dataset_id`,
     `check_group_registrations`, `check_local_endpoints`, and
-    `check_crawl_datetime`.
+    `check_is_synchronized`.
 
     Parameters
     ----------
@@ -308,7 +308,7 @@
     check_local_dataset_id(rgstrs)
     check_group_registrations(rgstrs)
     check_local_endpoints(rgstrs)
-    check_crawl_datetime(rgstrs)
+    check_is_synchronized(rgstrs)
     if extended_checks:
         check_local_dataset_id_format(rgstrs)
         check_local_dataset_group_id_format(rgstrs)
diff --git a/tests/registrations.csv b/tests/registrations.csv
index 6f106bc..848e225 100644
--- a/tests/registrations.csv
+++ b/tests/registrations.csv
@@ -1,8 +1,8 @@
-local_dataset_id,local_dataset_group_id,local_dataset_endpoint,gbif_dataset_uuid,gbif_endpoint_set_datetime
-edi.193.4,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/4/archive_edi.193.4_15494687022457218,e44c5367-9d09-4328-9a5a-d0f41fb22d61,2021-04-21 14:32:00
-edi.193.5,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/5/archive_edi.193.5_18246176635637788,e44c5367-9d09-4328-9a5a-d0f41fb22d61,2022-12-05 03:45:00
-edi.356.1,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/1/archive_edi.356.1_74665239205233345,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,2021-04-21 10:00:00
-edi.356.2,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/2/archive_edi.356.2_74665239205231111,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,2022-12-05 02:00:00
-knb-lter-msp.1.1,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/1/archive_knb-lter-msp.1.1_33365239205233345,8c30c4a7-4444-4421-83c0-60f3d3b195b1,2022-12-21 09:00:00
-knb-lter-msp.1.2,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/2/archive_knb-lter-msp.1.2_32165239205231111,8c30c4a7-4444-4421-83c0-60f3d3b195b1,2022-12-22 10:00:00
-edi.941.3,edi.941,https://pasta.lternet.edu/package/download/eml/edi/941/3,cfb3f6d5-ed7d-4fff-9f1b-f032ed1de485,2023-06-22 00:00:00
\ No newline at end of file
+local_dataset_id,local_dataset_group_id,local_dataset_endpoint,gbif_dataset_uuid,is_synchronized
+edi.193.4,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/4/archive_edi.193.4_15494687022457218,e44c5367-9d09-4328-9a5a-d0f41fb22d61,True
+edi.193.5,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/5/archive_edi.193.5_18246176635637788,e44c5367-9d09-4328-9a5a-d0f41fb22d61,True
+edi.356.1,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/1/archive_edi.356.1_74665239205233345,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,True
+edi.356.2,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/2/archive_edi.356.2_74665239205231111,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,True
+knb-lter-msp.1.1,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/1/archive_knb-lter-msp.1.1_33365239205233345,8c30c4a7-4444-4421-83c0-60f3d3b195b1,True
+knb-lter-msp.1.2,knb-lter-msp.1,https://pasta.lternet.edu/package/archive/eml/knb-lter-msp/1/2/archive_knb-lter-msp.1.2_32165239205231111,8c30c4a7-4444-4421-83c0-60f3d3b195b1,True
+edi.941.3,edi.941,https://pasta.lternet.edu/package/download/eml/edi/941/3,cfb3f6d5-ed7d-4fff-9f1b-f032ed1de485,True
\ No newline at end of file
diff --git a/tests/test_register.py b/tests/test_register.py
index 8413b70..4a60258 100644
--- a/tests/test_register.py
+++ b/tests/test_register.py
@@ -59,7 +59,7 @@ def test_get_gbif_dataset_uuid_does_not_exist(rgstrs, mocker):
         "local_dataset_endpoint"
     ] = "https://pasta.lternet.edu/package/download/eml/edi/929/1"
     new_row["gbif_dataset_uuid"] = None
-    new_row["gbif_endpoint_set_datetime"] = None
+    new_row["is_synchronized"] = False
     rgstrs = rgstrs.append(new_row, ignore_index=True)
     # Run the get_gbif_dataset_uuid function and check that it returns the new
     # GBIF dataset UUID.
@@ -147,8 +147,9 @@ def test_register_repairs_failed_registration(
     assert rgstrs_final.shape[0] == rgstrs_initial.shape[0]
     assert rgstrs_final.iloc[-1]["local_dataset_id"] == local_dataset_id
     assert rgstrs_final.iloc[-1]["gbif_dataset_uuid"] == gbif_dataset_uuid
-    # The last 3 columns of the last row should not be None. The datetime is
-    # the only column that should be None because it hasn't been crawled yet.
+    # The last 3 columns of the last row should not be None. The
+    # synchronization status is the only column that should be False because it
+    # hasn't been crawled yet.
     assert rgstrs_final.iloc[-1, -4:-1].notnull().all()
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index 2e8b723..0623cab 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -46,13 +46,6 @@ def test_read_registrations_reads_file():
     assert isinstance(rgstrs, pd.DataFrame)
 
 
-def test_read_registrations_formats_datetime():
-    """Formats the datetime column."""
-    rgstrs = read_registrations("tests/registrations.csv")
-    crawl_time = rgstrs["gbif_endpoint_set_datetime"]
-    assert pd.core.dtypes.common.is_datetime64_dtype(crawl_time)
-
-
 def test_read_local_dataset_metadata_success(mocker, eml):
     """Test that read_local_dataset_metadata returns a string on success."""
     mock_response = mocker.Mock()
diff --git a/tests/test_validate.py b/tests/test_validate.py
index 6095e74..fed8cc5 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -68,22 +68,22 @@ def test_check_one_to_one_cardinality_warn(rgstrs):
     assert rgstrs.loc[3, "local_dataset_endpoint"] in str(warns[0].message)
 
 
-def test_check_crawl_datetime_valid(rgstrs):
+def test_check_is_synchronized_valid(rgstrs):
     """The registrations file is valid, and doesn't throw a warning."""
     with warnings.catch_warnings(record=True) as warns:
         warnings.simplefilter("always")
-        validate.check_crawl_datetime(rgstrs)
+        validate.check_is_synchronized(rgstrs)
     assert len(warns) == 0
 
 
-def test_check_crawl_datetime_warn(rgstrs):
-    """Uncrawled registrations result in a warning."""
-    rgstrs.loc[0, "gbif_endpoint_set_datetime"] = np.nan
-    rgstrs.loc[2, "gbif_endpoint_set_datetime"] = np.nan
+def test_check_is_synchronized_warn(rgstrs):
+    """Unsynchronized registrations result in a warning."""
+    rgstrs.loc[0, "is_synchronized"] = False
+    rgstrs.loc[2, "is_synchronized"] = False
     with warnings.catch_warnings(record=True) as warns:
         warnings.simplefilter("always")
-        validate.check_crawl_datetime(rgstrs)
-    assert "Uncrawled registrations in rows" in str(warns[0].message)
+        validate.check_is_synchronized(rgstrs)
+    assert "Unsynchronized registrations in rows" in str(warns[0].message)
     assert "1, 3" in str(warns[0].message)
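
A minimal usage sketch of how the new is_synchronized column can be consumed downstream, mirroring validate.check_is_synchronized above. This is illustrative only and not part of the patch: the file path reuses the test fixture shown in the docstring example, and treating missing values as "not synchronized" via fillna(False) is an assumption of this sketch, not package behavior.

# Illustrative sketch: flag registrations not yet synchronized with GBIF.
import warnings

import pandas as pd

# Assumed path (the test fixture); any registrations file with the columns
# defined in initialize_registrations works the same way.
rgstrs = pd.read_csv("tests/registrations.csv", delimiter=",")

# Missing values are treated as "not synchronized" for this sketch.
unsynchronized = ~rgstrs["is_synchronized"].fillna(False).astype(bool)
if unsynchronized.any():
    rows = (rgstrs[unsynchronized].index.to_series() + 1).astype("string")
    warnings.warn("Unsynchronized registrations in rows: " + ", ".join(rows))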