From f3a69f2fbcaa1b86e0690acec35f307bbbf3b0bc Mon Sep 17 00:00:00 2001 From: Aaron Collier Date: Mon, 30 Sep 2024 15:38:35 -0700 Subject: [PATCH 1/3] Add IIIF_v3 Driver to support hashed metadata Refactoring Refactoring to improve complexity --- catalogs/aub.yaml | 20 +- dlme_airflow/drivers/__init__.py | 2 + dlme_airflow/drivers/iiif_json_v3.py | 203 ++++++++++++++++++++ dlme_airflow/utils/partition_url_builder.py | 17 +- tests/drivers/test_iiif_json_v3.py | 180 +++++++++++++++++ 5 files changed, 407 insertions(+), 15 deletions(-) create mode 100644 dlme_airflow/drivers/iiif_json_v3.py create mode 100644 tests/drivers/test_iiif_json_v3.py diff --git a/catalogs/aub.yaml b/catalogs/aub.yaml index a10b66b1..1db002fe 100644 --- a/catalogs/aub.yaml +++ b/catalogs/aub.yaml @@ -21,22 +21,22 @@ sources: header: "http://www.openarchives.org/OAI/2.0/" optional: true aladab: - driver: oai_xml + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "aladab" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/al-Adab + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/al-Adab/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1000 metadata: data_path: aub/aladab config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true + path: "id" postcards: driver: oai_xml args: diff --git a/dlme_airflow/drivers/__init__.py b/dlme_airflow/drivers/__init__.py index c2b34bfd..1cb416e1 100644 --- a/dlme_airflow/drivers/__init__.py +++ b/dlme_airflow/drivers/__init__.py @@ -1,6 +1,7 @@ import intake from dlme_airflow.drivers.iiif_json import IiifJsonSource +from dlme_airflow.drivers.iiif_json_v3 import IiifV3JsonSource from dlme_airflow.drivers.oai_xml import OaiXmlSource from dlme_airflow.drivers.xml import XmlSource from dlme_airflow.drivers.sequential_csv import SequentialCsvSource @@ -9,6 +10,7 @@ def register_drivers(): intake.source.register_driver("iiif_json", IiifJsonSource) + intake.source.register_driver("iiif_json_v3", IiifV3JsonSource) intake.source.register_driver("oai_xml", OaiXmlSource) intake.source.register_driver("xml", XmlSource) intake.source.register_driver("sequential_csv", SequentialCsvSource) diff --git a/dlme_airflow/drivers/iiif_json_v3.py b/dlme_airflow/drivers/iiif_json_v3.py new file mode 100644 index 00000000..b200b7ec --- /dev/null +++ b/dlme_airflow/drivers/iiif_json_v3.py @@ -0,0 +1,203 @@ +import logging +import intake +import requests +import jsonpath_ng +import pandas as pd +from typing import Any, Optional, Generator +from dlme_airflow.utils.partition_url_builder import PartitionBuilder + + +class IiifV3JsonSource(intake.source.base.DataSource): + container = "dataframe" + name = "iiif_json_v3" + version = "0.0.2" + partition_access = True + + def __init__( + self, + collection_url, + paging=None, + metadata=None + ): + super(IiifV3JsonSource, self).__init__(metadata=metadata) + self.collection_url = collection_url + self.paging = paging + self._manifest_urls = [] + self._path_expressions = {} + self.record_count = 0 + self.record_limit = self.metadata.get("record_limit") + + + def _open_collection(self): + self._manifest_urls = self._get_manifest_urls() + + def _get_manifest_urls(self): + if self.paging: + return PartitionBuilder(self.collection_url, self.paging).urls() + else: + return self._get_manifest_urls_from_items() + + + def _get_manifest_urls_from_items(self): + resp = self._get(self.collection_url) + collection_result = resp.json() + urls = [] + if "items" in collection_result: # IIIF v3 + manifests = collection_result["items"] + else: + raise Exception( + f"Unknown collection manifest format: {self.collection_url}" + ) + + for manifest in manifests: + if "@id" in manifest: + url = manifest["@id"] # valid in IIIF v2 or v3 + elif "id" in manifest: + url = manifest["id"] # valid in IIIF v3 only + else: + raise Exception(f"Unknown URL in manifest: {manifest}") + urls.append(url) + + return urls + + + def _open_manifest(self, manifest_url: str) -> Optional[dict]: + resp = self._get(manifest_url) + if resp.status_code == 200: + manifest_result = resp.json() + else: + logging.error( + f"got {resp.status_code} when fetching manifest {manifest_url}" + ) + return None + + record = self._extract_specified_fields(manifest_result) + + # Handles metadata in IIIF manifest + record.update( + self._extract_manifest_metadata(manifest_result.get("metadata", [])) + ) + return record + + def _extract_specified_fields(self, iiif_manifest: dict) -> dict: + output: dict [str, Any] = {} + for name, info in self.metadata.get("fields").items(): + result = self._get_data_for_field(name, iiif_manifest) + + if not result: + self._optional_field_warning(iiif_manifest.get("id"), name, self._path_expressions.get(name), info.get("optional")) + continue + + processed_result = _stringify_and_strip_if_list(result) + + if name in output: + output.update({name: _flatten_list([output[name], processed_result])}) + else: + output[name] = processed_result + + return output + + + def _get_data_for_field(self, field, manifest): + expression = self._path_expressions.get(field) + return [match.value for match in expression.find(manifest)] + + def _optional_field_warning(self, id, field, expression, optional): + if optional is True: + logging.debug(f"{id} missing optional field: '{field}'; searched path: '{expression}'") + return + + logging.warning(f"{id} missing required field: '{field}'; searched path: '{expression}'") + + + def _extract_manifest_metadata( + self, iiif_manifest_metadata + ) -> dict[str, list[str]]: + output: dict[str, list[str]] = {} + + for row in iiif_manifest_metadata: + for key in row.get("label").keys(): + name = ( + row.get("label")[key][0] + .replace(" ", "-") + .lower() + .replace("(", "") + .replace(")", "") + ) + # initialize or append to output[name] based on whether we've seen the label + metadata_value = row.get("value")[key] + if not metadata_value: + continue + + if name in output: + output.update({name: _flatten_list([output[name], metadata_value])}) + else: + output[name] = metadata_value + + # flatten any nested lists into a single list + return {k: list(_flatten_list(v)) for (k, v) in output.items()} + + def _get_partition(self, i) -> pd.DataFrame: + # if we are over the defined limit return an empty DataFrame right away + if self.record_limit is not None and self.record_count > self.record_limit: + return pd.DataFrame() + + result = self._open_manifest(self._manifest_urls[i]) + + # If the dictionary has AT LEAST one value that is not None return a + # DataFrame with the keys as columns, and the values as a row. + # Otherwise return an empty DataFrame that can be concatenated. + # This will prevent rows with all empty values from being generated + # For context see https://github.com/sul-dlss/dlme-airflow/issues/192 + + if result is not None and any(result.values()): + self.record_count += 1 + return pd.DataFrame([result]) + else: + logging.warning(f"{self._manifest_urls[i]} resulted in empty DataFrame") + return pd.DataFrame() + + def _get_schema(self): + for name, info in self.metadata.get("fields", {}).items(): + self._path_expressions[name] = jsonpath_ng.parse(info.get("path")) + self._open_collection() + return intake.source.base.Schema( + datashape=None, + dtype=self.dtype, + shape=None, + npartitions=len(self._manifest_urls), + extra_metadata={}, + ) + + def _get(self, url): + return requests.get(url) + + def read(self): + self._load_metadata() + df = pd.concat([self.read_partition(i) for i in range(self.npartitions)]) + if self.record_limit: + return df.head(self.record_limit) + else: + return df + + +def _stringify_and_strip_if_list(record) -> list[str]: + if isinstance(record, str): + return str(record).strip() + + result_list = [] + for data in record: + result_list.append(_stringify_and_strip_if_list(data)) + + if len(result_list) == 1: + return result_list[0] + + return result_list + + +def _flatten_list(lst: list) -> Generator: + for item in lst: + if type(item) is list: + yield from _flatten_list(item) + else: + yield item diff --git a/dlme_airflow/utils/partition_url_builder.py b/dlme_airflow/utils/partition_url_builder.py index c1bb2f9d..55478624 100644 --- a/dlme_airflow/utils/partition_url_builder.py +++ b/dlme_airflow/utils/partition_url_builder.py @@ -1,6 +1,6 @@ import requests import jsonpath_ng - +import validators class PartitionBuilder: """Determine the method used to extract or format the @@ -65,13 +65,12 @@ def _prefetch_page_urls(self): harvested = 0 ids = [] while True: - api_endpoint = f"{self.paging_config['pages_url']}?limit={self.paging_config['limit']}&offset={offset}" - data = self._fetch_provider_data(api_endpoint)["data"] + api_endpoint = self.paging_config['pages_url'].format(offset=offset,limit=self.paging_config['limit']) + data = self._fetch_provider_data(api_endpoint)[self.paging_config['page_data']] offset += self.paging_config["limit"] harvested = len(data) - for i in data: - ids.append(f"{self.collection_url}{i['id']}") + ids += self._extract_ids(data) if harvested < self.paging_config["limit"]: break @@ -86,3 +85,11 @@ def _fetch_provider_data(self, url): resp = requests.get(url, headers=headers) if resp.status_code == 200: return resp.json() + + def _extract_ids(self, data): + return [self._format_id(i['id']) for i in data] + + def _format_id(self, id): + if validators.url(id): + return id + return f"{self.collection_url}{id}" diff --git a/tests/drivers/test_iiif_json_v3.py b/tests/drivers/test_iiif_json_v3.py new file mode 100644 index 00000000..89a4b6e2 --- /dev/null +++ b/tests/drivers/test_iiif_json_v3.py @@ -0,0 +1,180 @@ +import logging +import pytest +import requests +import pandas as pd + +from dlme_airflow.drivers.iiif_json_v3 import IiifV3JsonSource + +LOGGER = logging.getLogger(__name__) + + +class MockIIIFCollectionV2Response: + @property + def status_code(self): + return 200 + + @staticmethod + def json(): + return { + "manifests": [ + {"@id": "https://collection.edu/iiif/p15795coll29:28/manifest.json"} + ] + } + + +class MockIIIFCollectionV3Response: + @property + def status_code(self): + return 200 + + @staticmethod + def json(): + return { + "items": [ + {"id": "https://collection.edu/iiif/p15795coll29:28/manifest.json"} + ] + } + + +class MockIIIFManifestResponse: + @property + def status_code(self): + return 200 + + @staticmethod + def json(): + return { + "@context": "http://iiif.io/api/presentation/3/context.json", + "id": "https://collection.edu/iiif/p15795coll29:28/manifest.json", + "metadata": [ + { + "label": { + "en": ["Source"] + }, + "value": { + "en": ["Rare Books and Special Collections Library"] + }, + }, + {"label": {"en": ["Title (main)"]}, "value": {"en": ["A great title of the Middle East"]}}, + {"label": {"en": ["Title (sub)"]}, "value": {"en": ["Subtitle 1"]}}, + {"label": {"en": ["Title (sub)"]}, "value": {"en": ["Subtitle 2"]}}, + {"label": {"en": ["Date Created"]}, "value": {"en": [["1974"]]}}, + ], + "sequences": [ + { + "canvases": [ + {"images": [{"resource": {"format": "image/jpeg"}}]}, + {"images": [{"resource": {"format": "image/jpeg"}}]}, + ] + } + ], + "description": ["A descriptive phrase", " with further elaboration "], + } + + +@pytest.fixture +def mock_response(monkeypatch): + def mock_get(*args, **kwargs): + if args[0].endswith("v2_collection.json"): + return MockIIIFCollectionV2Response() + if args[0].endswith("v3_collection.json"): + return MockIIIFCollectionV3Response() + if args[0].endswith("manifest.json"): + return MockIIIFManifestResponse() + return + + monkeypatch.setattr(requests, "get", mock_get) + + +@pytest.fixture +def iiif_test_v3_source(): + metadata = { + "fields": { + "context": { + "path": "@context", + "optional": True, + }, # a specified field with one value in the metadata + "description_top": {"path": "description", "optional": True}, + "iiif_format": { + "path": "sequences..format" + }, # a specified field with multiple values in the metadata + "profile": {"path": "sequences..profile"}, # a missing required field + "thumbnail": { + "path": "thumbnail..@id", + "optional": True, + }, # missing optional field + } + } + return IiifV3JsonSource( + collection_url="http://iiif_v3_collection.json", metadata=metadata + ) + + +def test_IiifJsonSource_initial(iiif_test_v3_source, mock_response): + assert len(iiif_test_v3_source._manifest_urls) == 0 + + +def test_IiifJsonSource_get_schema(iiif_test_v3_source, mock_response): + iiif_test_v3_source._get_schema() + assert ( + iiif_test_v3_source._manifest_urls[0] + == "https://collection.edu/iiif/p15795coll29:28/manifest.json" + ) + + +def test_IiifJsonSource_read(iiif_test_v3_source, mock_response): + iiif_df = iiif_test_v3_source.read() + test_columns = [ + "context", + "description_top", + "iiif_format", + "source", + "title-main", + "title-sub", + ] + assert all([a == b for a, b in zip(iiif_df.columns, test_columns)]) + + +def test_IiifJsonSource_df(iiif_test_v3_source, mock_response): + iiif_df = iiif_test_v3_source.read() + test_df = pd.DataFrame( + [ + { + "context": "http://iiif.io/api/presentation/3/context.json", + "description_top": ["A descriptive phrase", "with further elaboration"], + "iiif_format": ["image/jpeg", "image/jpeg"], + "source": ["Rare Books and Special Collections Library"], + "title-main": ["A great title of the Middle East"], + "title-sub": ["Subtitle 1", "Subtitle 2"], + "date-created": ["1974"], + } + ] + ) + assert iiif_df.equals(test_df) + + +def test_IiifJsonSource_logging(iiif_test_v3_source, mock_response, caplog): + with caplog.at_level(logging.WARNING): + iiif_test_v3_source.read() + assert ( + "https://collection.edu/iiif/p15795coll29:28/manifest.json missing required field: 'profile'; searched path: 'sequences..profile'" # noqa: E501 + in caplog.text + ) + assert "missing optional field" not in caplog.text + + with caplog.at_level(logging.DEBUG): + iiif_test_v3_source.read() + assert ( + "https://collection.edu/iiif/p15795coll29:28/manifest.json missing optional field: 'thumbnail'; searched path: 'thumbnail..@id'" # noqa: E501 + in caplog.text + ) + + +# def test_wait(iiif_test_v3_source): +# driver = IiifV3JsonSource("https://example.com/iiif/", wait=2) +# assert driver, "IiifJsonSource constructor accepts wait parameter" + + +def test_list_encode(iiif_test_v3_source, mock_response): + iiif_df = iiif_test_v3_source.read() + assert iiif_df["date-created"][0] == ["1974"] From 13dce289da9dbea2147d424f6efd11f856839d5e Mon Sep 17 00:00:00 2001 From: Aaron Collier Date: Tue, 1 Oct 2024 14:53:31 -0700 Subject: [PATCH 2/3] Update tests to match updated paging configuration --- tests/drivers/test_json.py | 3 ++- tests/support/schemas/schema.yaml | 1 + tests/utils/test_partition_url_builder.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/drivers/test_json.py b/tests/drivers/test_json.py index 7d439db3..7e246a20 100644 --- a/tests/drivers/test_json.py +++ b/tests/drivers/test_json.py @@ -161,9 +161,10 @@ def test_happy_path_prefetch_urls(requests_mock): record_selector=record_selector, metadata=metadata, paging={ - "pages_url": "https://example.com/collection", + "pages_url": "https://example.com/collection?limit={limit}&offset={offset}", "urls": "data.id", "limit": 3, + "page_data": "data", }, ) diff --git a/tests/support/schemas/schema.yaml b/tests/support/schemas/schema.yaml index 1f61c07c..0da12c0d 100644 --- a/tests/support/schemas/schema.yaml +++ b/tests/support/schemas/schema.yaml @@ -25,6 +25,7 @@ properties: - custom_json - json - iiif_json + - iiif_json_v3 - oai_xml - sequential_csv - xml diff --git a/tests/utils/test_partition_url_builder.py b/tests/utils/test_partition_url_builder.py index 9db23bf4..67acf94a 100644 --- a/tests/utils/test_partition_url_builder.py +++ b/tests/utils/test_partition_url_builder.py @@ -86,9 +86,10 @@ def test_prefetch_page_urls(requests_mock): partitionBuilder = PartitionBuilder( collection_url=collection_url, paging_config={ - "pages_url": "https://example.com/collection", + "pages_url": "https://example.com/collection?limit={limit}&offset={offset}", "urls": "data.id", "limit": 3, + "page_data": "data", }, ) From dece25aa04a68ef6d8220d314063beb6f17708bf Mon Sep 17 00:00:00 2001 From: Aaron Collier Date: Tue, 1 Oct 2024 15:27:24 -0700 Subject: [PATCH 3/3] Update brooklyn config to support new paging options Remove extra list flattening Update tests after removing aub/aco collection WIP Remove debugging print statements --- bin/get | 2 + catalogs/aub.yaml | 109 ++++++------- catalogs/brooklyn.yaml | 3 +- dlme_airflow/drivers/iiif_json_v3.py | 89 ++++------ dlme_airflow/utils/partition_url_builder.py | 27 ++++ tests/data/iiif_v3/collection_items.json | 41 +++++ tests/data/iiif_v3/collection_manifest.json | 20 +++ tests/data/iiif_v3/item_manifest.json | 170 ++++++++++++++++++++ tests/drivers/test_iiif_json_v3.py | 151 +++++++++-------- tests/models/test_collection.py | 16 +- tests/tasks/test_archive.py | 22 +-- tests/tasks/test_index.py | 2 +- tests/utils/test_dataframe.py | 4 +- 13 files changed, 436 insertions(+), 220 deletions(-) create mode 100644 tests/data/iiif_v3/collection_items.json create mode 100644 tests/data/iiif_v3/collection_manifest.json create mode 100644 tests/data/iiif_v3/item_manifest.json diff --git a/bin/get b/bin/get index e374aa1a..c579edaf 100755 --- a/bin/get +++ b/bin/get @@ -19,6 +19,8 @@ def main(opts): if collection is None: sys.exit(f'💥 Provider "{opts.provider}" does not have a collection "{opts.collection}"') + print(f"Harvesting collection: {opts.collection}") + # set driver record limit if it is allowed by the driver if opts.limit: if hasattr(collection.catalog, 'record_limit'): diff --git a/catalogs/aub.yaml b/catalogs/aub.yaml index 1db002fe..6cbbfc95 100644 --- a/catalogs/aub.yaml +++ b/catalogs/aub.yaml @@ -3,23 +3,6 @@ metadata: data_path: aub schedule: "30 13 15 Jan,Apr,Jul,Oct *" sources: - aco: - driver: oai_xml - args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "aco" - allow_expiration: true - full_harvest: true - metadata: - data_path: aub/aco - config: aub - fields: - id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true aladab: driver: iiif_json_v3 args: @@ -30,7 +13,7 @@ sources: page_fields: - id - thumbnail.id - limit: 1000 + limit: 1010 metadata: data_path: aub/aladab config: aub @@ -38,70 +21,70 @@ sources: id: path: "id" postcards: - driver: oai_xml + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "postcards" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Postcards + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Postcards/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1010 metadata: data_path: aub/postcards config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true + path: "id" posters: - driver: oai_xml + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "posters" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Posters + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Posters/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1000 metadata: - data_path: aub/posters + data_path: aub/postcards config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true - thamarat_al_funun: - driver: oai_xml + path: "id" + travelbooks: + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "thf" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Travel%20Books + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Travel%20Books/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1000 metadata: - data_path: aub/thamarat_al_funun + data_path: aub/postcards config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true - travelbooks: - driver: oai_xml + path: "id" + manuscripts: + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "travelbooks" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Manuscripts + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Manuscripts/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1000 metadata: - data_path: aub/travelbooks + data_path: aub/postcards config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true + path: "id" diff --git a/catalogs/brooklyn.yaml b/catalogs/brooklyn.yaml index f58afa46..7d455347 100644 --- a/catalogs/brooklyn.yaml +++ b/catalogs/brooklyn.yaml @@ -8,9 +8,10 @@ sources: args: collection_url: https://www.brooklynmuseum.org/api/v2/object/ paging: - pages_url: https://www.brooklynmuseum.org/api/v2/collection/5/object + pages_url: https://www.brooklynmuseum.org/api/v2/collection/5/object?limit={limit}&offset={offset} urls: data.id limit: 25 + page_data: data record_selector: "data" api_key: "0IzFpBiUksT8LMVGLUxovj9IR0ltlSH1" metadata: diff --git a/dlme_airflow/drivers/iiif_json_v3.py b/dlme_airflow/drivers/iiif_json_v3.py index b200b7ec..079059e0 100644 --- a/dlme_airflow/drivers/iiif_json_v3.py +++ b/dlme_airflow/drivers/iiif_json_v3.py @@ -22,46 +22,27 @@ def __init__( super(IiifV3JsonSource, self).__init__(metadata=metadata) self.collection_url = collection_url self.paging = paging - self._manifest_urls = [] + self._manifests = [] self._path_expressions = {} self.record_count = 0 self.record_limit = self.metadata.get("record_limit") + self.partition_builder = None - - def _open_collection(self): - self._manifest_urls = self._get_manifest_urls() - - def _get_manifest_urls(self): if self.paging: - return PartitionBuilder(self.collection_url, self.paging).urls() - else: - return self._get_manifest_urls_from_items() + self.partition_builder = PartitionBuilder(self.collection_url, self.paging) - def _get_manifest_urls_from_items(self): - resp = self._get(self.collection_url) - collection_result = resp.json() - urls = [] - if "items" in collection_result: # IIIF v3 - manifests = collection_result["items"] - else: - raise Exception( - f"Unknown collection manifest format: {self.collection_url}" - ) + def _open_collection(self): + self._manifests = self._get_manifests() - for manifest in manifests: - if "@id" in manifest: - url = manifest["@id"] # valid in IIIF v2 or v3 - elif "id" in manifest: - url = manifest["id"] # valid in IIIF v3 only - else: - raise Exception(f"Unknown URL in manifest: {manifest}") - urls.append(url) - - return urls + + def _get_manifests(self): + if self.paging: + return self.partition_builder.records() - def _open_manifest(self, manifest_url: str) -> Optional[dict]: + def _open_manifest(self, manifest: dict) -> Optional[dict]: + manifest_url = manifest["id"] resp = self._get(manifest_url) if resp.status_code == 200: manifest_result = resp.json() @@ -77,6 +58,9 @@ def _open_manifest(self, manifest_url: str) -> Optional[dict]: record.update( self._extract_manifest_metadata(manifest_result.get("metadata", [])) ) + + # Handles the thumbnail field provided in the collection manifest + record.update({"thumbnail": manifest.get("thumbnail")}) return record def _extract_specified_fields(self, iiif_manifest: dict) -> dict: @@ -106,7 +90,7 @@ def _optional_field_warning(self, id, field, expression, optional): if optional is True: logging.debug(f"{id} missing optional field: '{field}'; searched path: '{expression}'") return - + logging.warning(f"{id} missing required field: '{field}'; searched path: '{expression}'") @@ -116,33 +100,28 @@ def _extract_manifest_metadata( output: dict[str, list[str]] = {} for row in iiif_manifest_metadata: - for key in row.get("label").keys(): - name = ( - row.get("label")[key][0] - .replace(" ", "-") - .lower() - .replace("(", "") - .replace(")", "") - ) - # initialize or append to output[name] based on whether we've seen the label - metadata_value = row.get("value")[key] - if not metadata_value: - continue - - if name in output: - output.update({name: _flatten_list([output[name], metadata_value])}) - else: - output[name] = metadata_value - - # flatten any nested lists into a single list - return {k: list(_flatten_list(v)) for (k, v) in output.items()} + (label, values) = self._extract_metadata_for_row(row) + output.setdefault(label, []).extend(values) + + return output + + def _extract_metadata_for_row(self, row): + values = [] + lang = next(iter(row.get("label"))) + label = row.get("label")[lang][0].replace(" ", "-").lower().replace("(", "").replace(")", "") + for key in row.get("label").keys(): + # initialize or append to output[name] based on whether we've seen the label + values += row.get("value")[key] + + return label, values + def _get_partition(self, i) -> pd.DataFrame: # if we are over the defined limit return an empty DataFrame right away if self.record_limit is not None and self.record_count > self.record_limit: return pd.DataFrame() - result = self._open_manifest(self._manifest_urls[i]) + result = self._open_manifest(self._manifests[i]) # If the dictionary has AT LEAST one value that is not None return a # DataFrame with the keys as columns, and the values as a row. @@ -165,7 +144,7 @@ def _get_schema(self): datashape=None, dtype=self.dtype, shape=None, - npartitions=len(self._manifest_urls), + npartitions=len(self._manifests), extra_metadata={}, ) @@ -188,10 +167,10 @@ def _stringify_and_strip_if_list(record) -> list[str]: result_list = [] for data in record: result_list.append(_stringify_and_strip_if_list(data)) - + if len(result_list) == 1: return result_list[0] - + return result_list diff --git a/dlme_airflow/utils/partition_url_builder.py b/dlme_airflow/utils/partition_url_builder.py index 55478624..d29535db 100644 --- a/dlme_airflow/utils/partition_url_builder.py +++ b/dlme_airflow/utils/partition_url_builder.py @@ -17,6 +17,7 @@ def __init__( self.paging_config = paging_config self.provider_data = None self.api_key = api_key + self.data = [] def urls(self): if self.paging_config.get("pages_url"): @@ -30,6 +31,10 @@ def urls(self): return [] + def records(self): + if self.paging_config.get("pages_url"): + return self._prefetch_page_data() + def _urls_from_provider(self): urls = [self.collection_url] expression = jsonpath_ng.parse(self.paging_config["urls"]) @@ -71,12 +76,29 @@ def _prefetch_page_urls(self): harvested = len(data) ids += self._extract_ids(data) + if self.paging_config.get("page_fields"): + self.data += self._extract_data(data) if harvested < self.paging_config["limit"]: break return ids + def _prefetch_page_data(self): + offset = 0 + harvested = 0 + data = [] + while True: + api_endpoint = self.paging_config['pages_url'].format(offset=offset,limit=self.paging_config['limit']) + data += self._fetch_provider_data(api_endpoint)[self.paging_config['page_data']] + offset += self.paging_config["limit"] + harvested = len(data) + + if harvested < self.paging_config["limit"]: + break + + return data + def _fetch_provider_data(self, url): headers = {} if self.api_key: @@ -89,6 +111,11 @@ def _fetch_provider_data(self, url): def _extract_ids(self, data): return [self._format_id(i['id']) for i in data] + def _extract_data(self, data): + return [{ + self._format_id(i['id']): i['thumbnail'][0]['id'] + } for i in data] + def _format_id(self, id): if validators.url(id): return id diff --git a/tests/data/iiif_v3/collection_items.json b/tests/data/iiif_v3/collection_items.json new file mode 100644 index 00000000..3332b72a --- /dev/null +++ b/tests/data/iiif_v3/collection_items.json @@ -0,0 +1,41 @@ +{ + "@context" : [ "http://www.w3.org/ns/anno.jsonld", "http://iiif.io/api/presentation/3/context.json" ], + "id" : "https://libraries.aub.edu.lb/iiifservices/collection/Posters/0/166", + "type" : "Collection", + "label" : { + "en" : [ "Ranged collection for Posters." ] + }, + "requiredStatement" : { + "label" : { + "en" : [ "Attribution" ] + }, + "value" : { + "en" : [ "AUB University Libraries." ] + } + }, + "items" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3hd1k/manifest", + "type" : "Manifest", + "label" : { + "en" : [ "Madi" ], + "ar" : [ "ماضي" ] + }, + "thumbnail" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/files/posters/ark86073b3hd1k/thumb.jpg", + "type" : "Image", + "format" : "image/jpeg" + } ] + }, { + "id" : "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3705g/manifest", + "type" : "Manifest", + "label" : { + "en" : [ "H. Madi" ], + "ar" : [ "ماضي" ] + }, + "thumbnail" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/files/posters/ark86073b3705g/thumb.jpg", + "type" : "Image", + "format" : "image/jpeg" + } ] + } ] +} diff --git a/tests/data/iiif_v3/collection_manifest.json b/tests/data/iiif_v3/collection_manifest.json new file mode 100644 index 00000000..a951a6ba --- /dev/null +++ b/tests/data/iiif_v3/collection_manifest.json @@ -0,0 +1,20 @@ +{ + "@context" : [ "http://www.w3.org/ns/anno.jsonld", "http://iiif.io/api/presentation/3/context.json" ], + "id" : "https://libraries.aub.edu.lb/iiifservices/collection/Posters", + "type" : "Collection", + "label" : { + "en" : [ "Collection for Posters (166 items)." ] + }, + "requiredStatement" : { + "label" : { + "en" : [ "Attribution" ] + }, + "value" : { + "en" : [ "AUB University Libraries." ] + } + }, + "items" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/collection/Posters/0/166", + "type" : "Collection" + } ] +} \ No newline at end of file diff --git a/tests/data/iiif_v3/item_manifest.json b/tests/data/iiif_v3/item_manifest.json new file mode 100644 index 00000000..362ec326 --- /dev/null +++ b/tests/data/iiif_v3/item_manifest.json @@ -0,0 +1,170 @@ +{ + "@context" : [ "http://www.w3.org/ns/anno.jsonld", "http://iiif.io/api/presentation/3/context.json" ], + "id" : "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest", + "type" : "Manifest", + "label" : { + "en" : [ "Nicolas Sursock Museum 21st Autumn Salon" ] + }, + "logo" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/files/aub_libraries_logo.jpg", + "type" : "Image", + "format" : "image/jpeg" + } ], + "homepage" : [ { + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b", + "type" : "Text", + "format" : "text/html", + "label" : { + "en" : [ "Homepage" ] + } + } ], + "requiredStatement" : { + "label" : { + "en" : [ "Attribution" ] + }, + "value" : { + "en" : [ "AUB University Libraries." ] + } + }, + "metadata" : [ { + "label" : { + "en" : [ "Title" ] + }, + "value" : { + "en" : [ "Nicolas Sursock Museum 21st Autumn Salon" ] + } + }, { + "label" : { + "ar" : [ "Title" ] + }, + "value" : { + "ar" : [ "متحف نقولا سرسق : معرض الخريف الحادي والعشرون" ] + } + }, { + "label" : { + "fr" : [ "Title" ] + }, + "value" : { + "fr" : [ "Musée Nicolas Sursock XXIe salon d'automne" ] + } + }, { + "label" : { + "none" : [ "Identifier" ] + }, + "value" : { + "none" : [ "ark86073b3x34b" ] + } + }, { + "label" : { + "en" : [ "Language" ] + }, + "value" : { + "en" : [ "Arabic" ] + } + }, { + "label" : { + "en" : [ "Date" ] + }, + "value" : { + "en" : [ "1998" ] + } + }, { + "label" : { + "en" : [ "Author(s)" ] + }, + "value" : { + "en" : [ "Theo" ] + } + }, { + "label" : { + "en" : [ "Description(s)" ] + }, + "value" : { + "en" : [ "Musée Nicolas Sursock XXIe salon d'automne 28 Novembre 1997 - 10 January 1998. Painting reproduced on poster by Theo." ] + } + }, { + "label" : { + "en" : [ "Extent" ] + }, + "value" : { + "en" : [ "printed on paper : 58 x 44 cm" ] + } + }, { + "label" : { + "en" : [ "Subject(s)" ] + }, + "value" : { + "en" : [ "Lebanese painters", "Art Exhibitions -- Lebanon" ] + } + }, { + "label" : { + "en" : [ "Collection" ] + }, + "value" : { + "en" : [ "Sursock Museum Collection", "American University of Beirut Art posters Collection" ] + } + }, { + "label" : { + "en" : [ "Rights" ] + }, + "value" : { + "en" : [ "AUB University Libraries Digital Collections are available under a Creative Commons Attribution-Noncommercial-NoDerivatives 4.0 International License under CC BY-NC-ND 4.0. Except where otherwise stated, anyone is free to download and share works under this license as long as they give credit for the original creation, the shared work is not changed and not used for commercial purposes. Attribution should be given to \"AUB University Libraries.\"" ] + } + }, { + "label" : { + "en" : [ "Homepage" ] + }, + "value" : { + "en" : [ "View on AUB Digital Collections" ] + } + }, { + "label" : { + "en" : [ "Usage Terms" ] + }, + "value" : { + "en" : [ "View Usage Terms on AUB Digital Collections" ] + } + } ], + "items" : [ { + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147", + "type" : "Canvas", + "label" : { + "en" : [ "1" ] + }, + "height" : 6000, + "width" : 4515, + "items" : [ { + "type" : "AnnotationPage", + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147/ap", + "items" : [ { + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147/image", + "type" : "Annotation", + "motivation" : "painting", + "target" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147", + "body" : { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg/full/full/0/default.jpg", + "type" : "Image", + "format" : "image/jpeg", + "height" : 6000, + "width" : 4515, + "service" : [ { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg", + "type" : "ImageService2", + "profile" : "http://iiif.io/api/image/2/level2.json" + } ] + } + } ] + } ], + "thumbnail" : [ { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg/full/91,/0/default.jpg", + "type" : "Image", + "format" : "image/jpeg", + "service" : [ { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg", + "type" : "ImageService2", + "profile" : "http://iiif.io/api/image/2/level2.json" + } ] + } ] + } ], + "viewingDirection" : "right-to-left" +} diff --git a/tests/drivers/test_iiif_json_v3.py b/tests/drivers/test_iiif_json_v3.py index 89a4b6e2..9a04d990 100644 --- a/tests/drivers/test_iiif_json_v3.py +++ b/tests/drivers/test_iiif_json_v3.py @@ -1,7 +1,7 @@ +import json import logging import pytest import requests -import pandas as pd from dlme_airflow.drivers.iiif_json_v3 import IiifV3JsonSource @@ -29,12 +29,9 @@ def status_code(self): @staticmethod def json(): - return { - "items": [ - {"id": "https://collection.edu/iiif/p15795coll29:28/manifest.json"} - ] - } - + with open("tests/data/iiif_v3/collection_items.json") as f: + data = json.load(f) + return data class MockIIIFManifestResponse: @property @@ -43,33 +40,9 @@ def status_code(self): @staticmethod def json(): - return { - "@context": "http://iiif.io/api/presentation/3/context.json", - "id": "https://collection.edu/iiif/p15795coll29:28/manifest.json", - "metadata": [ - { - "label": { - "en": ["Source"] - }, - "value": { - "en": ["Rare Books and Special Collections Library"] - }, - }, - {"label": {"en": ["Title (main)"]}, "value": {"en": ["A great title of the Middle East"]}}, - {"label": {"en": ["Title (sub)"]}, "value": {"en": ["Subtitle 1"]}}, - {"label": {"en": ["Title (sub)"]}, "value": {"en": ["Subtitle 2"]}}, - {"label": {"en": ["Date Created"]}, "value": {"en": [["1974"]]}}, - ], - "sequences": [ - { - "canvases": [ - {"images": [{"resource": {"format": "image/jpeg"}}]}, - {"images": [{"resource": {"format": "image/jpeg"}}]}, - ] - } - ], - "description": ["A descriptive phrase", " with further elaboration "], - } + with open("tests/data/iiif_v3/item_manifest.json") as f: + data = json.load(f) + return data @pytest.fixture @@ -77,10 +50,12 @@ def mock_response(monkeypatch): def mock_get(*args, **kwargs): if args[0].endswith("v2_collection.json"): return MockIIIFCollectionV2Response() - if args[0].endswith("v3_collection.json"): - return MockIIIFCollectionV3Response() - if args[0].endswith("manifest.json"): + # if args[0].endswith("v3_collection.json"): + # return MockIIIFCollectionV3Response() + if args[0].endswith("manifest"): return MockIIIFManifestResponse() + if "iiifservices/collection/al-Adab" in args[0]: + return MockIIIFCollectionV3Response() return monkeypatch.setattr(requests, "get", mock_get) @@ -90,82 +65,100 @@ def mock_get(*args, **kwargs): def iiif_test_v3_source(): metadata = { "fields": { - "context": { - "path": "@context", - "optional": True, + "id": { + "path": "id", }, # a specified field with one value in the metadata - "description_top": {"path": "description", "optional": True}, - "iiif_format": { - "path": "sequences..format" - }, # a specified field with multiple values in the metadata - "profile": {"path": "sequences..profile"}, # a missing required field - "thumbnail": { - "path": "thumbnail..@id", - "optional": True, - }, # missing optional field } } + paging = { + "pages_url": "https://iiif_v3_collection/iiifservices/collection/al-Adab/{offset}/{limit}", + "page_data": "items", + "limit": 1000 + } return IiifV3JsonSource( - collection_url="http://iiif_v3_collection.json", metadata=metadata + collection_url="https://iiif_v3_collection/iiifservices/collection/Posters", + paging=paging, + metadata=metadata ) def test_IiifJsonSource_initial(iiif_test_v3_source, mock_response): - assert len(iiif_test_v3_source._manifest_urls) == 0 + assert len(iiif_test_v3_source._manifests) == 0 def test_IiifJsonSource_get_schema(iiif_test_v3_source, mock_response): iiif_test_v3_source._get_schema() + assert len(iiif_test_v3_source._manifests) == 2 assert ( - iiif_test_v3_source._manifest_urls[0] - == "https://collection.edu/iiif/p15795coll29:28/manifest.json" + iiif_test_v3_source._manifests[0]["id"] + == "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3hd1k/manifest" ) def test_IiifJsonSource_read(iiif_test_v3_source, mock_response): iiif_df = iiif_test_v3_source.read() + print(f"Columns: {iiif_df.columns}") test_columns = [ - "context", - "description_top", - "iiif_format", - "source", - "title-main", - "title-sub", + "id", + "title", + "identifier", + "language", + "date", + "authors", + "descriptions", + "extent", + "subjects", + "collection", + "rights", ] assert all([a == b for a, b in zip(iiif_df.columns, test_columns)]) def test_IiifJsonSource_df(iiif_test_v3_source, mock_response): iiif_df = iiif_test_v3_source.read() - test_df = pd.DataFrame( - [ - { - "context": "http://iiif.io/api/presentation/3/context.json", - "description_top": ["A descriptive phrase", "with further elaboration"], - "iiif_format": ["image/jpeg", "image/jpeg"], - "source": ["Rare Books and Special Collections Library"], - "title-main": ["A great title of the Middle East"], - "title-sub": ["Subtitle 1", "Subtitle 2"], - "date-created": ["1974"], - } - ] - ) - assert iiif_df.equals(test_df) + assert len(iiif_df.get('id')) == 2 + assert iiif_df.get('id').to_list() == ["https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest", "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest"] -def test_IiifJsonSource_logging(iiif_test_v3_source, mock_response, caplog): +@pytest.fixture +def iiif_test_v3_source_with_profile(): + metadata = { + "fields": { + "thumbnail": { + "path": "thumbnail", + "optional": True, + }, # a specified field with one value in the metadata + "profile": { + "path": "profile", + "optional": False, + }, + } + } + paging = { + "pages_url": "https://iiif_v3_collection/iiifservices/collection/al-Adab/{offset}/{limit}", + "page_data": "items", + "limit": 1000 + } + return IiifV3JsonSource( + collection_url="http://iiif_v3_collection/iiifservices/collection/al-Adab", + paging=paging, + metadata=metadata + ) + +def test_IiifJsonSource_logging(iiif_test_v3_source_with_profile, mock_response, caplog): with caplog.at_level(logging.WARNING): - iiif_test_v3_source.read() + iiif_test_v3_source_with_profile.read() + print(f"CAPLOG: {caplog.text}") assert ( - "https://collection.edu/iiif/p15795coll29:28/manifest.json missing required field: 'profile'; searched path: 'sequences..profile'" # noqa: E501 + "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest missing required field: 'profile'; searched path: 'profile'" # noqa: E501 in caplog.text ) assert "missing optional field" not in caplog.text with caplog.at_level(logging.DEBUG): - iiif_test_v3_source.read() + iiif_test_v3_source_with_profile.read() assert ( - "https://collection.edu/iiif/p15795coll29:28/manifest.json missing optional field: 'thumbnail'; searched path: 'thumbnail..@id'" # noqa: E501 + "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest missing optional field: 'thumbnail'; searched path: 'thumbnail'" # noqa: E501 in caplog.text ) @@ -177,4 +170,4 @@ def test_IiifJsonSource_logging(iiif_test_v3_source, mock_response, caplog): def test_list_encode(iiif_test_v3_source, mock_response): iiif_df = iiif_test_v3_source.read() - assert iiif_df["date-created"][0] == ["1974"] + assert iiif_df.get("date").to_list() == [["1998"], ["1998"]] diff --git a/tests/models/test_collection.py b/tests/models/test_collection.py index 8946765a..775e7c05 100644 --- a/tests/models/test_collection.py +++ b/tests/models/test_collection.py @@ -7,28 +7,28 @@ def test_Collection(): provider = Provider("aub") - collection = Collection(provider, "aco") - assert collection.label() == "aub_aco" - assert collection.data_path() == "aub/aco" - assert collection.intermediate_representation_location() == "output-aub-aco.ndjson" + collection = Collection(provider, "aladab") + assert collection.label() == "aub_aladab" + assert collection.data_path() == "aub/aladab" + assert collection.intermediate_representation_location() == "output-aub-aladab.ndjson" def test_datafile(): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") working_data_path = os.path.abspath("working") assert collection.datafile("csv") == os.path.join( - working_data_path, "aub", "aco", "data.csv" + working_data_path, "aub", "aladab", "data.csv" ) assert collection.datafile("json") == os.path.join( - working_data_path, "aub", "aco", "data.json" + working_data_path, "aub", "aladab", "data.json" ) def test_Provider_NotSupported(): with pytest.raises(Exception) as error: provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") collection.datafile("xml") assert str(error.value) == "Unsupported data output format: xml" diff --git a/tests/tasks/test_archive.py b/tests/tasks/test_archive.py index 8b306333..3710174a 100644 --- a/tests/tasks/test_archive.py +++ b/tests/tasks/test_archive.py @@ -9,7 +9,7 @@ test_working = Path("test-working") test_archive = Path("test-archive") -test_dir = test_working / "aub" / "aco" +test_dir = test_working / "aub" / "aladab" test_csv = test_dir / "data.csv" test_json = test_dir / "data.json" test_now = datetime.datetime(2023, 3, 13, 18, 6, 31) @@ -56,13 +56,13 @@ def setup(monkeypatch): def test_archive_dir(): provider = Provider("aub") - collection = Collection(provider, "aco") - assert collection.archive_dir().endswith("archive/aub/aco") + collection = Collection(provider, "aladab") + assert collection.archive_dir().endswith("archive/aub/aladab") def test_csv_with_data(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") fh = test_csv.open("w") fh.write("id,author,title\n") @@ -73,7 +73,7 @@ def test_csv_with_data(setup, mock_collection_datafile, mock_now): assert result is not None assert result["csv"].endswith( - "test-archive/aub/aco/data-20230313180631.csv" + "test-archive/aub/aladab/data-20230313180631.csv" ), "returned CSV archive filename" assert Path(result["csv"]).is_file(), "archived file exists" assert test_csv.is_file(), "original data file should still be there" @@ -81,7 +81,7 @@ def test_csv_with_data(setup, mock_collection_datafile, mock_now): def test_empty_csv(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") test_csv.touch() result = archive_collection(collection=collection) @@ -91,7 +91,7 @@ def test_empty_csv(setup, mock_collection_datafile, mock_now): def test_csv_with_header(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") test_csv.open("w").write("id,author,title\n") @@ -103,7 +103,7 @@ def test_csv_with_header(setup, mock_collection_datafile, mock_now): # mock_now not used here since we want to call at two different times def test_identical_csv(setup, mock_collection_datafile): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") fh = test_csv.open("w") fh.write("id,author,title\n") @@ -132,7 +132,7 @@ def test_identical_csv(setup, mock_collection_datafile): def test_json_with_data(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") fh = test_json.open("w") fh.write("""{"id": 1, "title": "Maṭnawīye Ma'nawī"}\n""") @@ -142,7 +142,7 @@ def test_json_with_data(setup, mock_collection_datafile, mock_now): assert len(result) != 0 assert result["json"].endswith( - "test-archive/aub/aco/data-20230313180631.json" + "test-archive/aub/aladab/data-20230313180631.json" ), "returned archived json filename" assert Path(result["json"]).is_file(), "archived file exists" assert test_json.is_file(), "original data file should still be there" @@ -150,7 +150,7 @@ def test_json_with_data(setup, mock_collection_datafile, mock_now): def test_empty_json(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") test_json.touch() result = archive_collection(collection=collection) diff --git a/tests/tasks/test_index.py b/tests/tasks/test_index.py index 42baa9aa..f0fc557e 100644 --- a/tests/tasks/test_index.py +++ b/tests/tasks/test_index.py @@ -24,6 +24,6 @@ def mock_post(self, **data): def test_index_collection(mock_request): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") params = {"collection": collection} assert index_collection(**params) == "Harvest successfully initiated" diff --git a/tests/utils/test_dataframe.py b/tests/utils/test_dataframe.py index cc345eac..e647f406 100644 --- a/tests/utils/test_dataframe.py +++ b/tests/utils/test_dataframe.py @@ -22,13 +22,13 @@ def mock_datafile(_, format): def test_dataframe_from_csv_file(mock_collection_datafile): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") assert dataframe_from_file(collection)["id"].count() == 4 def test_dataframe_from_json_file(mock_collection_datafile): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") assert dataframe_from_file(collection, "json")["id"].count() == 2