refactor: improve clarity in initialize_registrations
- Rename the 'initialize_registrations' function to
'initialize_registrations_file' to make clear that it initializes a file.
- Improve the descriptions of the file contents and of how they map to
concepts in the EDI repository.
- Move the function to the 'register.py' module, where it joins similar
code, for better findability.
clnsmth authored Sep 27, 2023
1 parent 3b9cefb commit ee98c9e
Showing 4 changed files with 84 additions and 72 deletions.
52 changes: 52 additions & 0 deletions src/gbif_registrar/register.py
@@ -1,4 +1,5 @@
"""Functions for registering datasets with GBIF."""
import os.path
import json
import requests
import pandas as pd
@@ -11,6 +12,57 @@
)
from gbif_registrar.utilities import read_registrations
from gbif_registrar.utilities import get_local_dataset_endpoint
from gbif_registrar.utilities import expected_cols


def initialize_registrations_file(file_path):
    """Writes an empty registrations file to path.

    The registrations file maps datasets from the local EDI data repository to
    the remote GBIF registry and indicates the synchronization status between
    the two.

    Parameters
    ----------
    file_path : str
        Path of the file to be written. A .csv file extension is expected.

    Returns
    -------
    None
        Writes the registrations file, as a .csv, to `file_path`.

    Notes
    -----
    The registrations file columns and definitions are as follows:

    - `local_dataset_id`: The dataset identifier in the EDI repository. This
      is the primary key. The term 'dataset' used here is synonymous with the
      term 'data package' in the EDI repository.
    - `local_dataset_group_id`: The dataset group identifier in the EDI
      repository. This often forms a one-to-many relationship with
      `local_dataset_id`. The term 'dataset group' used here is synonymous
      with the term 'data package series' in the EDI repository.
    - `local_dataset_endpoint`: The endpoint for downloading the dataset from
      the EDI repository. This forms a one-to-one relationship with
      `local_dataset_id`.
    - `gbif_dataset_uuid`: The registration identifier assigned by GBIF to the
      `local_dataset_group_id`. This forms a one-to-one relationship with
      `local_dataset_group_id`.
    - `is_synchronized`: The synchronization status of the `local_dataset_id`
      with GBIF. Is `True` if the local dataset is synchronized with GBIF and
      `False` if it is not. This forms a one-to-one relationship with
      `local_dataset_id`.

    Examples
    --------
    >>> initialize_registrations_file("registrations.csv")
    """
    if not os.path.exists(file_path):
        data = pd.DataFrame(columns=expected_cols())
        # mode="x" creates the file and raises FileExistsError if it already
        # exists, guarding against accidental overwrites.
        data.to_csv(file_path, index=False, mode="x")


def register(file_path, local_dataset_id=None):
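For orientation, here is a minimal sketch of what a registrations file might look like with one populated row, using the columns defined in the docstring above. The identifiers, endpoint URL, and UUID are hypothetical, not taken from this commit:

import pandas as pd

# Hypothetical example row; all values are illustrative only.
registrations = pd.DataFrame(
    [
        {
            "local_dataset_id": "edi.123.1",      # an EDI data package
            "local_dataset_group_id": "edi.123",  # its data package series
            "local_dataset_endpoint": "https://pasta.lternet.edu/package/download/eml/edi/123/1",
            "gbif_dataset_uuid": "00000000-0000-0000-0000-000000000000",
            "is_synchronized": True,
        }
    ]
)
print(registrations.to_csv(index=False))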
41 changes: 0 additions & 41 deletions src/gbif_registrar/utilities.py
@@ -1,52 +1,11 @@
"""Miscellaneous utilities"""
import os.path
from json import loads
import pandas as pd
from lxml import etree
import requests
from gbif_registrar.config import PASTA_ENVIRONMENT, GBIF_API


def initialize_registrations(file_path):
    """Writes an empty registrations file to path.

    The registrations file is a map from datasets in the local repository, to
    identifiers in the remote GBIF registry. This file contains additional
    information about the local datasets, as well as the synchronization
    status of the local dataset with GBIF. The registrations file columns
    (and definitions):

    - `local_dataset_id`: The identifier of the dataset in the local
      repository system. This is the primary key.
    - `local_dataset_group_id`: An identifier for grouping datasets of the
      same series. This can form a one-to-many relationship with
      `local_dataset_id`.
    - `local_dataset_endpoint`: The endpoint for the local dataset to be
      crawled by GBIF. This generally has a one-to-one relationship with
      `local_dataset_id`.
    - `gbif_dataset_uuid`: The registration identifier assigned by GBIF to the
      local dataset group. This has a one-to-one relationship with
      `local_dataset_group_id`.
    - `is_synchronized`: The synchronization status of the local dataset with
      GBIF.

    Parameters
    ----------
    file_path : Any
        Path of file to be written. A .csv file extension is expected.

    Returns
    -------
    None
        The registrations file as a .csv.
    """
    if os.path.exists(file_path):
        pass
    else:
        data = pd.DataFrame(columns=expected_cols())
        data.to_csv(file_path, index=False, mode="x")


def read_registrations(file_path):
    """Reads the registrations file.
32 changes: 32 additions & 0 deletions tests/test_register.py
@@ -1,12 +1,44 @@
"""Test register.py"""

import os.path
import hashlib
import pandas as pd
from gbif_registrar.utilities import read_registrations
from gbif_registrar.register import get_local_dataset_group_id
from gbif_registrar.register import get_local_dataset_endpoint
from gbif_registrar.register import get_gbif_dataset_uuid
from gbif_registrar.register import register
from gbif_registrar.register import request_gbif_dataset_uuid
from gbif_registrar.config import PASTA_ENVIRONMENT
from gbif_registrar.register import initialize_registrations_file
from gbif_registrar.utilities import expected_cols


def test_initialize_registrations_file_writes_to_path(tmp_path):
    """File is written to path."""
    file = tmp_path / "registrations.csv"
    initialize_registrations_file(file)
    assert os.path.exists(file)


def test_initialize_registrations_file_does_not_overwrite(tmp_path):
    """Does not overwrite."""
    file = tmp_path / "registrations.csv"
    initialize_registrations_file(file)
    with open(file, "rb") as rgstrs:
        md5_before = hashlib.md5(rgstrs.read()).hexdigest()
    # Call again; an existing file should be left untouched.
    initialize_registrations_file(file)
    with open(file, "rb") as rgstrs:
        md5_after = hashlib.md5(rgstrs.read()).hexdigest()
    assert md5_before == md5_after


def test_initialize_registrations_file_has_expected_columns(tmp_path):
    """Has expected columns."""
    file = tmp_path / "registrations.csv"
    initialize_registrations_file(file)
    data = pd.read_csv(file, delimiter=",")
    missing_cols = not set(expected_cols()).issubset(set(data.columns))
    assert not missing_cols


def test_get_local_dataset_group_id(local_dataset_id):
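As the does-not-overwrite test above suggests, initialize_registrations_file has two layers of protection: the os.path.exists() guard skips the write entirely, and mode="x" opens the file in exclusive-creation mode, which raises rather than clobbers if the file already exists. A standalone sketch of that second layer (the file name is illustrative; this assumes a pandas version that passes mode through to open, as current releases do):

import pandas as pd

df = pd.DataFrame(columns=["local_dataset_id"])
df.to_csv("example.csv", index=False, mode="x")  # creates the file
try:
    df.to_csv("example.csv", index=False, mode="x")  # second call fails
except FileExistsError:
    print("mode='x' refused to overwrite the existing file")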
31 changes: 0 additions & 31 deletions tests/test_utilities.py
@@ -1,45 +1,14 @@
"""Test utilities"""

import os.path
import hashlib
from json import loads
import pandas as pd
from gbif_registrar.utilities import read_registrations
from gbif_registrar.utilities import initialize_registrations
from gbif_registrar.utilities import expected_cols
from gbif_registrar.utilities import read_local_dataset_metadata
from gbif_registrar.utilities import has_metadata
from gbif_registrar.utilities import read_gbif_dataset_metadata
from gbif_registrar.utilities import is_synchronized


def test_initialize_registrations_writes_to_path(tmp_path):
    """File is written to path."""
    file = tmp_path / "registrations.csv"
    initialize_registrations(file)
    assert os.path.exists(file)


def test_initialize_registrations_does_not_overwrite(tmp_path):
    """Does not overwrite."""
    file = tmp_path / "registrations.csv"
    initialize_registrations(file)
    with open(file, "rb") as rgstrs:
        md5_before = hashlib.md5(rgstrs.read()).hexdigest()
    with open(file, "rb") as rgstrs:
        md5_after = hashlib.md5(rgstrs.read()).hexdigest()
    assert md5_before == md5_after


def test_initialize_registrations_has_expected_columns(tmp_path):
    """Has expected columns."""
    file = tmp_path / "registrations.csv"
    initialize_registrations(file)
    data = pd.read_csv(file, delimiter=",")
    missing_cols = not set(expected_cols()).issubset(set(data.columns))
    assert not missing_cols


def test_read_registrations_reads_file():
    """Reads the file."""
    rgstrs = read_registrations("tests/registrations.csv")
