refactor: improve clarity in initialize_registrations
- Rename the 'initialize_registrations' function to
'initialize_registrations_file' to make clear that it initializes a file.
- Improve the descriptions of the file contents and of how they map to
concepts in the EDI repository.
- Move the function to the 'register.py' module, where it joins similar
code, for better findability.
clnsmth authored Sep 27, 2023
1 parent 3b9cefb commit ee98c9e
Showing 4 changed files with 84 additions and 72 deletions.
52 changes: 52 additions & 0 deletions src/gbif_registrar/register.py
@@ -1,4 +1,5 @@
"""Functions for registering datasets with GBIF."""
import os.path
import json
import requests
import pandas as pd
@@ -11,6 +12,57 @@
)
from gbif_registrar.utilities import read_registrations
from gbif_registrar.utilities import get_local_dataset_endpoint
from gbif_registrar.utilities import expected_cols


def initialize_registrations_file(file_path):
    """Writes an empty registrations file to path.

    The registrations file maps datasets from the local EDI data repository to
    the remote GBIF registry and indicates the synchronization status between
    the two.

    Parameters
    ----------
    file_path : str
        Path of the file to be written. A .csv file extension is expected.

    Returns
    -------
    None
        Writes the registrations file, as a .csv, to `file_path`.

    Notes
    -----
    The registrations file columns and definitions are as follows:

    - `local_dataset_id`: The dataset identifier in the EDI repository. This
      is the primary key. The term 'dataset' used here is synonymous with the
      term 'data package' in the EDI repository.
    - `local_dataset_group_id`: The dataset group identifier in the EDI
      repository. This often forms a one-to-many relationship with
      `local_dataset_id`. The term 'dataset group' used here is synonymous
      with the term 'data package series' in the EDI repository.
    - `local_dataset_endpoint`: The endpoint for downloading the dataset from
      the EDI repository. This forms a one-to-one relationship with
      `local_dataset_id`.
    - `gbif_dataset_uuid`: The registration identifier assigned by GBIF to the
      `local_dataset_group_id`. This forms a one-to-one relationship with
      `local_dataset_group_id`.
    - `is_synchronized`: The synchronization status of the `local_dataset_id`
      with GBIF. Is `True` if the local dataset is synchronized with GBIF and
      `False` if it is not. This forms a one-to-one relationship with
      `local_dataset_id`.

    Examples
    --------
    >>> initialize_registrations_file("registrations.csv")
    """
    if not os.path.exists(file_path):
        data = pd.DataFrame(columns=expected_cols())
        # mode="x" creates the file and raises FileExistsError if it already
        # exists, guarding against accidental overwrites.
        data.to_csv(file_path, index=False, mode="x")


def register(file_path, local_dataset_id=None):
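For orientation, here is a minimal sketch of what a registrations file might look like with one populated row, using the columns defined in the docstring above. The identifiers, endpoint URL, and UUID are hypothetical, not taken from this commit:

import pandas as pd

# Hypothetical example row; all values are illustrative only.
registrations = pd.DataFrame(
    [
        {
            "local_dataset_id": "edi.123.1",      # an EDI data package
            "local_dataset_group_id": "edi.123",  # its data package series
            "local_dataset_endpoint": "https://pasta.lternet.edu/package/download/eml/edi/123/1",
            "gbif_dataset_uuid": "00000000-0000-0000-0000-000000000000",
            "is_synchronized": True,
        }
    ]
)
print(registrations.to_csv(index=False))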
41 changes: 0 additions & 41 deletions src/gbif_registrar/utilities.py
@@ -1,52 +1,11 @@
"""Miscellaneous utilities"""
import os.path
from json import loads
import pandas as pd
from lxml import etree
import requests
from gbif_registrar.config import PASTA_ENVIRONMENT, GBIF_API


def initialize_registrations(file_path):
    """Writes an empty registrations file to path.

    The registrations file is a map from datasets in the local repository, to
    identifiers in the remote GBIF registry. This file contains additional
    information about the local datasets, as well as the synchronization
    status of the local dataset with GBIF. The registrations file columns
    (and definitions):

    - `local_dataset_id`: The identifier of the dataset in the local
      repository system. This is the primary key.
    - `local_dataset_group_id`: An identifier for grouping datasets of the
      same series. This can form a one-to-many relationship with
      `local_dataset_id`.
    - `local_dataset_endpoint`: The endpoint for the local dataset to be
      crawled by GBIF. This generally has a one-to-one relationship with
      `local_dataset_id`.
    - `gbif_dataset_uuid`: The registration identifier assigned by GBIF to the
      local dataset group. This has a one-to-one relationship with
      `local_dataset_group_id`.
    - `is_synchronized`: The synchronization status of the local dataset with
      GBIF.

    Parameters
    ----------
    file_path : Any
        Path of file to be written. A .csv file extension is expected.

    Returns
    -------
    None
        The registrations file as a .csv.
    """
    if os.path.exists(file_path):
        pass
    else:
        data = pd.DataFrame(columns=expected_cols())
        data.to_csv(file_path, index=False, mode="x")


def read_registrations(file_path):
    """Reads the registrations file.
32 changes: 32 additions & 0 deletions tests/test_register.py
@@ -1,12 +1,44 @@
"""Test register.py"""

import os.path
import hashlib
import pandas as pd
from gbif_registrar.utilities import read_registrations
from gbif_registrar.register import get_local_dataset_group_id
from gbif_registrar.register import get_local_dataset_endpoint
from gbif_registrar.register import get_gbif_dataset_uuid
from gbif_registrar.register import register
from gbif_registrar.register import request_gbif_dataset_uuid
from gbif_registrar.config import PASTA_ENVIRONMENT
from gbif_registrar.register import initialize_registrations_file
from gbif_registrar.utilities import expected_cols


def test_initialize_registrations_file_writes_to_path(tmp_path):
    """File is written to path."""
    file = tmp_path / "registrations.csv"
    initialize_registrations_file(file)
    assert os.path.exists(file)


def test_initialize_registrations_file_does_not_overwrite(tmp_path):
    """Does not overwrite."""
    file = tmp_path / "registrations.csv"
    initialize_registrations_file(file)
    with open(file, "rb") as rgstrs:
        md5_before = hashlib.md5(rgstrs.read()).hexdigest()
    # Call again; an existing file should be left untouched.
    initialize_registrations_file(file)
    with open(file, "rb") as rgstrs:
        md5_after = hashlib.md5(rgstrs.read()).hexdigest()
    assert md5_before == md5_after


def test_initialize_registrations_file_has_expected_columns(tmp_path):
    """Has expected columns."""
    file = tmp_path / "registrations.csv"
    initialize_registrations_file(file)
    data = pd.read_csv(file, delimiter=",")
    missing_cols = not set(expected_cols()).issubset(set(data.columns))
    assert not missing_cols


def test_get_local_dataset_group_id(local_dataset_id):
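As the does-not-overwrite test above suggests, initialize_registrations_file has two layers of protection: the os.path.exists() guard skips the write entirely, and mode="x" opens the file in exclusive-creation mode, which raises rather than clobbers if the file already exists. A standalone sketch of that second layer (the file name is illustrative; this assumes a pandas version that passes mode through to open, as current releases do):

import pandas as pd

df = pd.DataFrame(columns=["local_dataset_id"])
df.to_csv("example.csv", index=False, mode="x")  # creates the file
try:
    df.to_csv("example.csv", index=False, mode="x")  # second call fails
except FileExistsError:
    print("mode='x' refused to overwrite the existing file")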
31 changes: 0 additions & 31 deletions tests/test_utilities.py
@@ -1,45 +1,14 @@
"""Test utilities"""

import os.path
import hashlib
from json import loads
import pandas as pd
from gbif_registrar.utilities import read_registrations
from gbif_registrar.utilities import initialize_registrations
from gbif_registrar.utilities import expected_cols
from gbif_registrar.utilities import read_local_dataset_metadata
from gbif_registrar.utilities import has_metadata
from gbif_registrar.utilities import read_gbif_dataset_metadata
from gbif_registrar.utilities import is_synchronized


def test_initialize_registrations_writes_to_path(tmp_path):
    """File is written to path."""
    file = tmp_path / "registrations.csv"
    initialize_registrations(file)
    assert os.path.exists(file)


def test_initialize_registrations_does_not_overwrite(tmp_path):
    """Does not overwrite."""
    file = tmp_path / "registrations.csv"
    initialize_registrations(file)
    with open(file, "rb") as rgstrs:
        md5_before = hashlib.md5(rgstrs.read()).hexdigest()
    with open(file, "rb") as rgstrs:
        md5_after = hashlib.md5(rgstrs.read()).hexdigest()
    assert md5_before == md5_after


def test_initialize_registrations_has_expected_columns(tmp_path):
    """Has expected columns."""
    file = tmp_path / "registrations.csv"
    initialize_registrations(file)
    data = pd.read_csv(file, delimiter=",")
    missing_cols = not set(expected_cols()).issubset(set(data.columns))
    assert not missing_cols


def test_read_registrations_reads_file():
    """Reads the file."""
    rgstrs = read_registrations("tests/registrations.csv")
