Skip to content

Commit

Permalink
refactor: clarify definition of 'synchronization'
Browse files Browse the repository at this point in the history
Rename the 'is_synchronized' column to 'synchronized' to clarify its
meaning, shifting from "this dataset is currently synchronized with
GBIF" to "this dataset has in the past been synchronized with GBIF."
Also rename the '_check_is_synchronized' function to
'_check_synchronized' to match the new column name.
  • Loading branch information
clnsmth authored Oct 11, 2023
1 parent f23fc23 commit 2cb5392
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 20 deletions.
16 changes: 8 additions & 8 deletions src/gbif_registrar/_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def _check_completeness(rgstrs):
"""Checks registrations for completeness.
A complete registration has values for all fields except (perhaps)
`is_synchronized`, which is not essential for initiating a GBIF
`synchronized`, which is not essential for initiating a GBIF
crawl.
Parameters
Expand All @@ -39,7 +39,7 @@ def _check_completeness(rgstrs):
UserWarning
If any registrations are incomplete.
"""
rgstrs = rgstrs[_expected_cols()].drop(["is_synchronized"], axis=1)
rgstrs = rgstrs[_expected_cols()].drop(["synchronized"], axis=1)
rgstrs = rgstrs[rgstrs.isna().any(axis=1)]
if len(rgstrs) > 0:
rows = rgstrs.index.to_series() + 1
Expand Down Expand Up @@ -73,12 +73,12 @@ def _check_group_registrations(rgstrs):
)


def _check_is_synchronized(rgstrs):
def _check_synchronized(rgstrs):
"""Checks if registrations have been synchronized.
Registrations contain all the information needed for GBIF to successfully
crawl the corresponding dataset and post to the GBIF data portal. Boolean
True/False values in the `is_synchronized` field indicate the dataset is
True/False values in the `synchronized` field indicate whether the dataset has been
synchronized.
Parameters
Expand All @@ -95,9 +95,9 @@ def _check_is_synchronized(rgstrs):
-----
If a registration has not yet been crawled.
"""
if not rgstrs["is_synchronized"].all():
rows = rgstrs["is_synchronized"].index.to_series() + 1
rows = rows[~rgstrs["is_synchronized"]].astype("string")
if not rgstrs["synchronized"].all():
rows = rgstrs["synchronized"].index.to_series() + 1
rows = rows[~rgstrs["synchronized"]].astype("string")
warnings.warn("Unsynchronized registrations in rows: " + ", ".join(rows))


Expand Down Expand Up @@ -295,7 +295,7 @@ def _expected_cols():
"local_dataset_group_id",
"local_dataset_endpoint",
"gbif_dataset_uuid",
"is_synchronized",
"synchronized",
]
return cols

Expand Down
4 changes: 2 additions & 2 deletions src/gbif_registrar/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def register_dataset(local_dataset_id, registrations_file):
"local_dataset_group_id": None,
"local_dataset_endpoint": None,
"gbif_dataset_uuid": None,
"is_synchronized": False,
"synchronized": False,
},
index=[0],
)
Expand Down Expand Up @@ -138,7 +138,7 @@ def complete_registration_records(registrations_file):
registrations = _read_registrations_file(registrations_file)
# Get all rows where the registrations dataframe columns
# local_dataset_group_id, local_dataset_endpoint, gbif_dataset_uuid,
# is_synchronized contain empty values. These are the rows
# synchronized contain empty values. These are the rows
# that need to be completed.
record = registrations[
(registrations["local_dataset_group_id"].isnull())
Expand Down
6 changes: 3 additions & 3 deletions src/gbif_registrar/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from gbif_registrar._utilities import _check_local_dataset_id
from gbif_registrar._utilities import _check_group_registrations
from gbif_registrar._utilities import _check_local_endpoints
from gbif_registrar._utilities import _check_is_synchronized
from gbif_registrar._utilities import _check_synchronized
from gbif_registrar._utilities import _check_local_dataset_id_format
from gbif_registrar._utilities import _check_local_dataset_group_id_format

Expand All @@ -15,7 +15,7 @@ def validate_registrations(file_path):
This is a wrapper to `_check_completeness`, `_check_local_dataset_id`,
`_check_group_registrations`, `_check_local_endpoints`, and
`_check_is_synchronized`.
`_check_synchronized`.
Parameters
----------
Expand All @@ -40,6 +40,6 @@ def validate_registrations(file_path):
_check_local_dataset_id(rgstrs)
_check_group_registrations(rgstrs)
_check_local_endpoints(rgstrs)
_check_is_synchronized(rgstrs)
_check_synchronized(rgstrs)
_check_local_dataset_id_format(rgstrs)
_check_local_dataset_group_id_format(rgstrs)
2 changes: 1 addition & 1 deletion tests/registrations.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
local_dataset_id,local_dataset_group_id,local_dataset_endpoint,gbif_dataset_uuid,is_synchronized
local_dataset_id,local_dataset_group_id,local_dataset_endpoint,gbif_dataset_uuid,synchronized
edi.193.4,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/4/archive_edi.193.4_15494687022457218,e44c5367-9d09-4328-9a5a-d0f41fb22d61,True
edi.193.5,edi.193,https://pasta.lternet.edu/package/archive/eml/edi/193/5/archive_edi.193.5_18246176635637788,e44c5367-9d09-4328-9a5a-d0f41fb22d61,True
edi.356.1,edi.356,https://pasta.lternet.edu/package/archive/eml/edi/356/1/archive_edi.356.1_74665239205233345,8c30c4a7-2f63-4421-83c0-60f3d3b195b1,True
Expand Down
10 changes: 5 additions & 5 deletions tests/test__utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
_check_completeness,
_check_local_dataset_id,
_check_one_to_one_cardinality,
_check_is_synchronized,
_check_synchronized,
_check_local_dataset_id_format,
_check_local_dataset_group_id_format,
_read_registrations_file,
Expand Down Expand Up @@ -90,17 +90,17 @@ def test_check_is_synchronized_valid(rgstrs):
"""The registrations file is valid, and doesn't throw a warning."""
with warnings.catch_warnings(record=True) as warns:
warnings.simplefilter("always")
_check_is_synchronized(rgstrs)
_check_synchronized(rgstrs)
assert len(warns) == 0


def test_check_is_synchronized_warn(rgstrs):
"""Unsynchronized registrations result in a warning."""
rgstrs.loc[0, "is_synchronized"] = False
rgstrs.loc[2, "is_synchronized"] = False
rgstrs.loc[0, "synchronized"] = False
rgstrs.loc[2, "synchronized"] = False
with warnings.catch_warnings(record=True) as warns:
warnings.simplefilter("always")
_check_is_synchronized(rgstrs)
_check_synchronized(rgstrs)
assert "Unsynchronized registrations in rows" in str(warns[0].message)
assert "1, 3" in str(warns[0].message)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_read_registrations_casts_dtypes():
assert rgstrs["local_dataset_group_id"].dtype == "string"
assert rgstrs["local_dataset_endpoint"].dtype == "string"
assert rgstrs["gbif_dataset_uuid"].dtype == "string"
isinstance(rgstrs["is_synchronized"].dtype, pd.BooleanDtype)
isinstance(rgstrs["synchronized"].dtype, pd.BooleanDtype)


def test_register_dataset_success(
Expand Down

0 comments on commit 2cb5392

Please sign in to comment.