From 51ba73471fd485295b1850bc1de8a65568f8deda Mon Sep 17 00:00:00 2001 From: Aaron Collier Date: Wed, 8 Jan 2025 11:02:42 -0800 Subject: [PATCH] Add IIIF_v3 Driver to support hashed metadata (#528) * Add IIIF_v3 Driver to support hashed metadata Refactoring Refactoring to improve complexity * Update tests to match updated paging configuration * Update brooklyn config to support new paging options Remove extra list flattening Update tests after removing aub/aco collection WIP Remove debugging print statements --- bin/get | 2 + catalogs/aub.yaml | 127 ++++++-------- catalogs/brooklyn.yaml | 3 +- dlme_airflow/drivers/__init__.py | 2 + dlme_airflow/drivers/iiif_json_v3.py | 182 ++++++++++++++++++++ dlme_airflow/utils/partition_url_builder.py | 44 ++++- tests/data/iiif_v3/collection_items.json | 41 +++++ tests/data/iiif_v3/collection_manifest.json | 20 +++ tests/data/iiif_v3/item_manifest.json | 170 ++++++++++++++++++ tests/drivers/test_iiif_json_v3.py | 173 +++++++++++++++++++ tests/drivers/test_json.py | 3 +- tests/models/test_collection.py | 16 +- tests/support/schemas/schema.yaml | 1 + tests/tasks/test_archive.py | 22 +-- tests/tasks/test_index.py | 2 +- tests/utils/test_dataframe.py | 4 +- tests/utils/test_partition_url_builder.py | 3 +- 17 files changed, 713 insertions(+), 102 deletions(-) create mode 100644 dlme_airflow/drivers/iiif_json_v3.py create mode 100644 tests/data/iiif_v3/collection_items.json create mode 100644 tests/data/iiif_v3/collection_manifest.json create mode 100644 tests/data/iiif_v3/item_manifest.json create mode 100644 tests/drivers/test_iiif_json_v3.py diff --git a/bin/get b/bin/get index e374aa1a..c579edaf 100755 --- a/bin/get +++ b/bin/get @@ -19,6 +19,8 @@ def main(opts): if collection is None: sys.exit(f'💥 Provider "{opts.provider}" does not have a collection "{opts.collection}"') + print(f"Harvesting collection: {opts.collection}") + # set driver record limit if it is allowed by the driver if opts.limit: if hasattr(collection.catalog, 
'record_limit'): diff --git a/catalogs/aub.yaml b/catalogs/aub.yaml index a10b66b1..6cbbfc95 100644 --- a/catalogs/aub.yaml +++ b/catalogs/aub.yaml @@ -3,105 +3,88 @@ metadata: data_path: aub schedule: "30 13 15 Jan,Apr,Jul,Oct *" sources: - aco: - driver: oai_xml - args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "aco" - allow_expiration: true - full_harvest: true - metadata: - data_path: aub/aco - config: aub - fields: - id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true aladab: - driver: oai_xml + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "aladab" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/al-Adab + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/al-Adab/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1010 metadata: data_path: aub/aladab config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true + path: "id" postcards: - driver: oai_xml + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "postcards" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Postcards + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Postcards/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1010 metadata: data_path: aub/postcards config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true + path: "id" posters: - driver: oai_xml + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: 
"posters" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Posters + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Posters/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1000 metadata: - data_path: aub/posters + data_path: aub/posters config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true - thamarat_al_funun: - driver: oai_xml + path: "id" + travelbooks: + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "thf" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Travel%20Books + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Travel%20Books/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1000 metadata: - data_path: aub/thamarat_al_funun + data_path: aub/travelbooks config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true - travelbooks: - driver: oai_xml + path: "id" + manuscripts: + driver: iiif_json_v3 args: - collection_url: https://libraries.aub.edu.lb/xtf/oai - metadata_prefix: oai_dc - set: "travelbooks" - allow_expiration: true - full_harvest: true + collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Manuscripts + paging: + pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Manuscripts/{offset}/{limit} + page_data: items + page_fields: + - id + - thumbnail.id + limit: 1000 metadata: - data_path: aub/travelbooks + data_path: aub/manuscripts config: aub fields: id: - path: "//header:identifier" - namespace: - header: "http://www.openarchives.org/OAI/2.0/" - optional: true + path: "id" diff --git a/catalogs/brooklyn.yaml b/catalogs/brooklyn.yaml index 
f58afa46..7d455347 100644 --- a/catalogs/brooklyn.yaml +++ b/catalogs/brooklyn.yaml @@ -8,9 +8,10 @@ sources: args: collection_url: https://www.brooklynmuseum.org/api/v2/object/ paging: - pages_url: https://www.brooklynmuseum.org/api/v2/collection/5/object + pages_url: https://www.brooklynmuseum.org/api/v2/collection/5/object?limit={limit}&offset={offset} urls: data.id limit: 25 + page_data: data record_selector: "data" api_key: "0IzFpBiUksT8LMVGLUxovj9IR0ltlSH1" metadata: diff --git a/dlme_airflow/drivers/__init__.py b/dlme_airflow/drivers/__init__.py index c2b34bfd..1cb416e1 100644 --- a/dlme_airflow/drivers/__init__.py +++ b/dlme_airflow/drivers/__init__.py @@ -1,6 +1,7 @@ import intake from dlme_airflow.drivers.iiif_json import IiifJsonSource +from dlme_airflow.drivers.iiif_json_v3 import IiifV3JsonSource from dlme_airflow.drivers.oai_xml import OaiXmlSource from dlme_airflow.drivers.xml import XmlSource from dlme_airflow.drivers.sequential_csv import SequentialCsvSource @@ -9,6 +10,7 @@ def register_drivers(): intake.source.register_driver("iiif_json", IiifJsonSource) + intake.source.register_driver("iiif_json_v3", IiifV3JsonSource) intake.source.register_driver("oai_xml", OaiXmlSource) intake.source.register_driver("xml", XmlSource) intake.source.register_driver("sequential_csv", SequentialCsvSource) diff --git a/dlme_airflow/drivers/iiif_json_v3.py b/dlme_airflow/drivers/iiif_json_v3.py new file mode 100644 index 00000000..079059e0 --- /dev/null +++ b/dlme_airflow/drivers/iiif_json_v3.py @@ -0,0 +1,182 @@ +import logging +import intake +import requests +import jsonpath_ng +import pandas as pd +from typing import Any, Optional, Generator +from dlme_airflow.utils.partition_url_builder import PartitionBuilder + + +class IiifV3JsonSource(intake.source.base.DataSource): + container = "dataframe" + name = "iiif_json_v3" + version = "0.0.2" + partition_access = True + + def __init__( + self, + collection_url, + paging=None, + metadata=None + ): + 
super(IiifV3JsonSource, self).__init__(metadata=metadata) + self.collection_url = collection_url + self.paging = paging + self._manifests = [] + self._path_expressions = {} + self.record_count = 0 + self.record_limit = self.metadata.get("record_limit") + self.partition_builder = None + + if self.paging: + self.partition_builder = PartitionBuilder(self.collection_url, self.paging) + + + def _open_collection(self): + self._manifests = self._get_manifests() + + + def _get_manifests(self): + if self.paging: + return self.partition_builder.records() + + + def _open_manifest(self, manifest: dict) -> Optional[dict]: + manifest_url = manifest["id"] + resp = self._get(manifest_url) + if resp.status_code == 200: + manifest_result = resp.json() + else: + logging.error( + f"got {resp.status_code} when fetching manifest {manifest_url}" + ) + return None + + record = self._extract_specified_fields(manifest_result) + + # Handles metadata in IIIF manifest + record.update( + self._extract_manifest_metadata(manifest_result.get("metadata", [])) + ) + + # Handles the thumbnail field provided in the collection manifest + record.update({"thumbnail": manifest.get("thumbnail")}) + return record + + def _extract_specified_fields(self, iiif_manifest: dict) -> dict: + output: dict [str, Any] = {} + for name, info in self.metadata.get("fields").items(): + result = self._get_data_for_field(name, iiif_manifest) + + if not result: + self._optional_field_warning(iiif_manifest.get("id"), name, self._path_expressions.get(name), info.get("optional")) + continue + + processed_result = _stringify_and_strip_if_list(result) + + if name in output: + output.update({name: _flatten_list([output[name], processed_result])}) + else: + output[name] = processed_result + + return output + + + def _get_data_for_field(self, field, manifest): + expression = self._path_expressions.get(field) + return [match.value for match in expression.find(manifest)] + + def _optional_field_warning(self, id, field, expression, 
optional): + if optional is True: + logging.debug(f"{id} missing optional field: '{field}'; searched path: '{expression}'") + return + + logging.warning(f"{id} missing required field: '{field}'; searched path: '{expression}'") + + + def _extract_manifest_metadata( + self, iiif_manifest_metadata + ) -> dict[str, list[str]]: + output: dict[str, list[str]] = {} + + for row in iiif_manifest_metadata: + (label, values) = self._extract_metadata_for_row(row) + output.setdefault(label, []).extend(values) + + return output + + def _extract_metadata_for_row(self, row): + values = [] + lang = next(iter(row.get("label"))) + label = row.get("label")[lang][0].replace(" ", "-").lower().replace("(", "").replace(")", "") + for key in row.get("label").keys(): + # initialize or append to output[name] based on whether we've seen the label + values += row.get("value")[key] + + return label, values + + + def _get_partition(self, i) -> pd.DataFrame: + # if we are over the defined limit return an empty DataFrame right away + if self.record_limit is not None and self.record_count > self.record_limit: + return pd.DataFrame() + + result = self._open_manifest(self._manifests[i]) + + # If the dictionary has AT LEAST one value that is not None return a + # DataFrame with the keys as columns, and the values as a row. + # Otherwise return an empty DataFrame that can be concatenated. 
+ # This will prevent rows with all empty values from being generated + # For context see https://github.com/sul-dlss/dlme-airflow/issues/192 + + if result is not None and any(result.values()): + self.record_count += 1 + return pd.DataFrame([result]) + else: + logging.warning(f"{self._manifests[i]['id']} resulted in empty DataFrame") + return pd.DataFrame() + + def _get_schema(self): + for name, info in self.metadata.get("fields", {}).items(): + self._path_expressions[name] = jsonpath_ng.parse(info.get("path")) + self._open_collection() + return intake.source.base.Schema( + datashape=None, + dtype=self.dtype, + shape=None, + npartitions=len(self._manifests), + extra_metadata={}, + ) + + def _get(self, url): + return requests.get(url) + + def read(self): + self._load_metadata() + df = pd.concat([self.read_partition(i) for i in range(self.npartitions)]) + if self.record_limit: + return df.head(self.record_limit) + else: + return df + + +def _stringify_and_strip_if_list(record) -> list[str]: + if isinstance(record, str): + return str(record).strip() + + result_list = [] + for data in record: + result_list.append(_stringify_and_strip_if_list(data)) + + if len(result_list) == 1: + return result_list[0] + + return result_list + + +def _flatten_list(lst: list) -> Generator: + for item in lst: + if type(item) is list: + yield from _flatten_list(item) + else: + yield item diff --git a/dlme_airflow/utils/partition_url_builder.py b/dlme_airflow/utils/partition_url_builder.py index c1bb2f9d..d29535db 100644 --- a/dlme_airflow/utils/partition_url_builder.py +++ b/dlme_airflow/utils/partition_url_builder.py @@ -1,6 +1,6 @@ import requests import jsonpath_ng - +import validators class PartitionBuilder: """Determine the method used to extract or format the @@ -17,6 +17,7 @@ def __init__( self.paging_config = paging_config self.provider_data = None self.api_key = api_key + self.data = [] def urls(self): if self.paging_config.get("pages_url"): @@ -30,6 +31,10 @@ def urls(self): return 
[] + def records(self): + if self.paging_config.get("pages_url"): + return self._prefetch_page_data() + def _urls_from_provider(self): urls = [self.collection_url] expression = jsonpath_ng.parse(self.paging_config["urls"]) @@ -65,19 +70,35 @@ def _prefetch_page_urls(self): harvested = 0 ids = [] while True: - api_endpoint = f"{self.paging_config['pages_url']}?limit={self.paging_config['limit']}&offset={offset}" - data = self._fetch_provider_data(api_endpoint)["data"] + api_endpoint = self.paging_config['pages_url'].format(offset=offset,limit=self.paging_config['limit']) + data = self._fetch_provider_data(api_endpoint)[self.paging_config['page_data']] offset += self.paging_config["limit"] harvested = len(data) - for i in data: - ids.append(f"{self.collection_url}{i['id']}") + ids += self._extract_ids(data) + if self.paging_config.get("page_fields"): + self.data += self._extract_data(data) if harvested < self.paging_config["limit"]: break return ids + def _prefetch_page_data(self): + offset = 0 + harvested = 0 + data = [] + while True: + api_endpoint = self.paging_config['pages_url'].format(offset=offset,limit=self.paging_config['limit']) + page = self._fetch_provider_data(api_endpoint)[self.paging_config['page_data']] + offset += self.paging_config["limit"] + harvested = len(page) + data += page + if harvested < self.paging_config["limit"]: + break + + return data + def _fetch_provider_data(self, url): headers = {} if self.api_key: @@ -86,3 +107,16 @@ def _fetch_provider_data(self, url): resp = requests.get(url, headers=headers) if resp.status_code == 200: return resp.json() + + def _extract_ids(self, data): + return [self._format_id(i['id']) for i in data] + + def _extract_data(self, data): + return [{ + self._format_id(i['id']): i['thumbnail'][0]['id'] + } for i in data] + + def _format_id(self, id): + if validators.url(id): + return id + return f"{self.collection_url}{id}" diff --git a/tests/data/iiif_v3/collection_items.json b/tests/data/iiif_v3/collection_items.json new 
file mode 100644 index 00000000..3332b72a --- /dev/null +++ b/tests/data/iiif_v3/collection_items.json @@ -0,0 +1,41 @@ +{ + "@context" : [ "http://www.w3.org/ns/anno.jsonld", "http://iiif.io/api/presentation/3/context.json" ], + "id" : "https://libraries.aub.edu.lb/iiifservices/collection/Posters/0/166", + "type" : "Collection", + "label" : { + "en" : [ "Ranged collection for Posters." ] + }, + "requiredStatement" : { + "label" : { + "en" : [ "Attribution" ] + }, + "value" : { + "en" : [ "AUB University Libraries." ] + } + }, + "items" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3hd1k/manifest", + "type" : "Manifest", + "label" : { + "en" : [ "Madi" ], + "ar" : [ "ماضي" ] + }, + "thumbnail" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/files/posters/ark86073b3hd1k/thumb.jpg", + "type" : "Image", + "format" : "image/jpeg" + } ] + }, { + "id" : "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3705g/manifest", + "type" : "Manifest", + "label" : { + "en" : [ "H. Madi" ], + "ar" : [ "ماضي" ] + }, + "thumbnail" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/files/posters/ark86073b3705g/thumb.jpg", + "type" : "Image", + "format" : "image/jpeg" + } ] + } ] +} diff --git a/tests/data/iiif_v3/collection_manifest.json b/tests/data/iiif_v3/collection_manifest.json new file mode 100644 index 00000000..a951a6ba --- /dev/null +++ b/tests/data/iiif_v3/collection_manifest.json @@ -0,0 +1,20 @@ +{ + "@context" : [ "http://www.w3.org/ns/anno.jsonld", "http://iiif.io/api/presentation/3/context.json" ], + "id" : "https://libraries.aub.edu.lb/iiifservices/collection/Posters", + "type" : "Collection", + "label" : { + "en" : [ "Collection for Posters (166 items)." ] + }, + "requiredStatement" : { + "label" : { + "en" : [ "Attribution" ] + }, + "value" : { + "en" : [ "AUB University Libraries." 
] + } + }, + "items" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/collection/Posters/0/166", + "type" : "Collection" + } ] +} \ No newline at end of file diff --git a/tests/data/iiif_v3/item_manifest.json b/tests/data/iiif_v3/item_manifest.json new file mode 100644 index 00000000..362ec326 --- /dev/null +++ b/tests/data/iiif_v3/item_manifest.json @@ -0,0 +1,170 @@ +{ + "@context" : [ "http://www.w3.org/ns/anno.jsonld", "http://iiif.io/api/presentation/3/context.json" ], + "id" : "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest", + "type" : "Manifest", + "label" : { + "en" : [ "Nicolas Sursock Museum 21st Autumn Salon" ] + }, + "logo" : [ { + "id" : "https://libraries.aub.edu.lb/iiifservices/files/aub_libraries_logo.jpg", + "type" : "Image", + "format" : "image/jpeg" + } ], + "homepage" : [ { + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b", + "type" : "Text", + "format" : "text/html", + "label" : { + "en" : [ "Homepage" ] + } + } ], + "requiredStatement" : { + "label" : { + "en" : [ "Attribution" ] + }, + "value" : { + "en" : [ "AUB University Libraries." 
] + } + }, + "metadata" : [ { + "label" : { + "en" : [ "Title" ] + }, + "value" : { + "en" : [ "Nicolas Sursock Museum 21st Autumn Salon" ] + } + }, { + "label" : { + "ar" : [ "Title" ] + }, + "value" : { + "ar" : [ "متحف نقولا سرسق : معرض الخريف الحادي والعشرون" ] + } + }, { + "label" : { + "fr" : [ "Title" ] + }, + "value" : { + "fr" : [ "Musée Nicolas Sursock XXIe salon d'automne" ] + } + }, { + "label" : { + "none" : [ "Identifier" ] + }, + "value" : { + "none" : [ "ark86073b3x34b" ] + } + }, { + "label" : { + "en" : [ "Language" ] + }, + "value" : { + "en" : [ "Arabic" ] + } + }, { + "label" : { + "en" : [ "Date" ] + }, + "value" : { + "en" : [ "1998" ] + } + }, { + "label" : { + "en" : [ "Author(s)" ] + }, + "value" : { + "en" : [ "Theo" ] + } + }, { + "label" : { + "en" : [ "Description(s)" ] + }, + "value" : { + "en" : [ "Musée Nicolas Sursock XXIe salon d'automne 28 Novembre 1997 - 10 January 1998. Painting reproduced on poster by Theo." ] + } + }, { + "label" : { + "en" : [ "Extent" ] + }, + "value" : { + "en" : [ "printed on paper : 58 x 44 cm" ] + } + }, { + "label" : { + "en" : [ "Subject(s)" ] + }, + "value" : { + "en" : [ "Lebanese painters", "Art Exhibitions -- Lebanon" ] + } + }, { + "label" : { + "en" : [ "Collection" ] + }, + "value" : { + "en" : [ "Sursock Museum Collection", "American University of Beirut Art posters Collection" ] + } + }, { + "label" : { + "en" : [ "Rights" ] + }, + "value" : { + "en" : [ "AUB University Libraries Digital Collections are available under a Creative Commons Attribution-Noncommercial-NoDerivatives 4.0 International License under CC BY-NC-ND 4.0. Except where otherwise stated, anyone is free to download and share works under this license as long as they give credit for the original creation, the shared work is not changed and not used for commercial purposes. 
Attribution should be given to \"AUB University Libraries.\"" ] + } + }, { + "label" : { + "en" : [ "Homepage" ] + }, + "value" : { + "en" : [ "View on AUB Digital Collections" ] + } + }, { + "label" : { + "en" : [ "Usage Terms" ] + }, + "value" : { + "en" : [ "View Usage Terms on AUB Digital Collections" ] + } + } ], + "items" : [ { + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147", + "type" : "Canvas", + "label" : { + "en" : [ "1" ] + }, + "height" : 6000, + "width" : 4515, + "items" : [ { + "type" : "AnnotationPage", + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147/ap", + "items" : [ { + "id" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147/image", + "type" : "Annotation", + "motivation" : "painting", + "target" : "https://libraries.aub.edu.lb/blacklight/catalog/ark86073b3x34b/cps_2001d_0147", + "body" : { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg/full/full/0/default.jpg", + "type" : "Image", + "format" : "image/jpeg", + "height" : 6000, + "width" : 4515, + "service" : [ { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg", + "type" : "ImageService2", + "profile" : "http://iiif.io/api/image/2/level2.json" + } ] + } + } ] + } ], + "thumbnail" : [ { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg/full/91,/0/default.jpg", + "type" : "Image", + "format" : "image/jpeg", + "service" : [ { + "id" : "https://lib-iiifmedia.aub.edu.lb/iiif/2/posters%2Fark86073b3x34b%2Fps_2001d_0147.jpg", + "type" : "ImageService2", + "profile" : "http://iiif.io/api/image/2/level2.json" + } ] + } ] + } ], + "viewingDirection" : "right-to-left" +} diff --git a/tests/drivers/test_iiif_json_v3.py b/tests/drivers/test_iiif_json_v3.py new file mode 100644 index 00000000..9a04d990 --- /dev/null +++ b/tests/drivers/test_iiif_json_v3.py @@ -0,0 
+1,173 @@ +import json +import logging +import pytest +import requests + +from dlme_airflow.drivers.iiif_json_v3 import IiifV3JsonSource + +LOGGER = logging.getLogger(__name__) + + +class MockIIIFCollectionV2Response: + @property + def status_code(self): + return 200 + + @staticmethod + def json(): + return { + "manifests": [ + {"@id": "https://collection.edu/iiif/p15795coll29:28/manifest.json"} + ] + } + + +class MockIIIFCollectionV3Response: + @property + def status_code(self): + return 200 + + @staticmethod + def json(): + with open("tests/data/iiif_v3/collection_items.json") as f: + data = json.load(f) + return data + +class MockIIIFManifestResponse: + @property + def status_code(self): + return 200 + + @staticmethod + def json(): + with open("tests/data/iiif_v3/item_manifest.json") as f: + data = json.load(f) + return data + + +@pytest.fixture +def mock_response(monkeypatch): + def mock_get(*args, **kwargs): + if args[0].endswith("v2_collection.json"): + return MockIIIFCollectionV2Response() + # if args[0].endswith("v3_collection.json"): + # return MockIIIFCollectionV3Response() + if args[0].endswith("manifest"): + return MockIIIFManifestResponse() + if "iiifservices/collection/al-Adab" in args[0]: + return MockIIIFCollectionV3Response() + return + + monkeypatch.setattr(requests, "get", mock_get) + + +@pytest.fixture +def iiif_test_v3_source(): + metadata = { + "fields": { + "id": { + "path": "id", + }, # a specified field with one value in the metadata + } + } + paging = { + "pages_url": "https://iiif_v3_collection/iiifservices/collection/al-Adab/{offset}/{limit}", + "page_data": "items", + "limit": 1000 + } + return IiifV3JsonSource( + collection_url="https://iiif_v3_collection/iiifservices/collection/Posters", + paging=paging, + metadata=metadata + ) + + +def test_IiifJsonSource_initial(iiif_test_v3_source, mock_response): + assert len(iiif_test_v3_source._manifests) == 0 + + +def test_IiifJsonSource_get_schema(iiif_test_v3_source, mock_response): + 
iiif_test_v3_source._get_schema() + assert len(iiif_test_v3_source._manifests) == 2 + assert ( + iiif_test_v3_source._manifests[0]["id"] + == "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3hd1k/manifest" + ) + + +def test_IiifJsonSource_read(iiif_test_v3_source, mock_response): + iiif_df = iiif_test_v3_source.read() + print(f"Columns: {iiif_df.columns}") + test_columns = [ + "id", + "title", + "identifier", + "language", + "date", + "authors", + "descriptions", + "extent", + "subjects", + "collection", + "rights", + ] + assert all([a == b for a, b in zip(iiif_df.columns, test_columns)]) + + +def test_IiifJsonSource_df(iiif_test_v3_source, mock_response): + iiif_df = iiif_test_v3_source.read() + assert len(iiif_df.get('id')) == 2 + assert iiif_df.get('id').to_list() == ["https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest", "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest"] + + +@pytest.fixture +def iiif_test_v3_source_with_profile(): + metadata = { + "fields": { + "thumbnail": { + "path": "thumbnail", + "optional": True, + }, # a specified field with one value in the metadata + "profile": { + "path": "profile", + "optional": False, + }, + } + } + paging = { + "pages_url": "https://iiif_v3_collection/iiifservices/collection/al-Adab/{offset}/{limit}", + "page_data": "items", + "limit": 1000 + } + return IiifV3JsonSource( + collection_url="http://iiif_v3_collection/iiifservices/collection/al-Adab", + paging=paging, + metadata=metadata + ) + +def test_IiifJsonSource_logging(iiif_test_v3_source_with_profile, mock_response, caplog): + with caplog.at_level(logging.WARNING): + iiif_test_v3_source_with_profile.read() + print(f"CAPLOG: {caplog.text}") + assert ( + "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest missing required field: 'profile'; searched path: 'profile'" # noqa: E501 + in caplog.text + ) + assert "missing optional field" not in caplog.text + + with caplog.at_level(logging.DEBUG): + 
iiif_test_v3_source_with_profile.read() + assert ( + "https://libraries.aub.edu.lb/iiifservices/item/ark86073b3x34b/manifest missing optional field: 'thumbnail'; searched path: 'thumbnail'" # noqa: E501 + in caplog.text + ) + + +# def test_wait(iiif_test_v3_source): +# driver = IiifV3JsonSource("https://example.com/iiif/", wait=2) +# assert driver, "IiifJsonSource constructor accepts wait parameter" + + +def test_list_encode(iiif_test_v3_source, mock_response): + iiif_df = iiif_test_v3_source.read() + assert iiif_df.get("date").to_list() == [["1998"], ["1998"]] diff --git a/tests/drivers/test_json.py b/tests/drivers/test_json.py index 7d439db3..7e246a20 100644 --- a/tests/drivers/test_json.py +++ b/tests/drivers/test_json.py @@ -161,9 +161,10 @@ def test_happy_path_prefetch_urls(requests_mock): record_selector=record_selector, metadata=metadata, paging={ - "pages_url": "https://example.com/collection", + "pages_url": "https://example.com/collection?limit={limit}&offset={offset}", "urls": "data.id", "limit": 3, + "page_data": "data", }, ) diff --git a/tests/models/test_collection.py b/tests/models/test_collection.py index 8946765a..775e7c05 100644 --- a/tests/models/test_collection.py +++ b/tests/models/test_collection.py @@ -7,28 +7,28 @@ def test_Collection(): provider = Provider("aub") - collection = Collection(provider, "aco") - assert collection.label() == "aub_aco" - assert collection.data_path() == "aub/aco" - assert collection.intermediate_representation_location() == "output-aub-aco.ndjson" + collection = Collection(provider, "aladab") + assert collection.label() == "aub_aladab" + assert collection.data_path() == "aub/aladab" + assert collection.intermediate_representation_location() == "output-aub-aladab.ndjson" def test_datafile(): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") working_data_path = os.path.abspath("working") assert collection.datafile("csv") == os.path.join( - 
working_data_path, "aub", "aco", "data.csv" + working_data_path, "aub", "aladab", "data.csv" ) assert collection.datafile("json") == os.path.join( - working_data_path, "aub", "aco", "data.json" + working_data_path, "aub", "aladab", "data.json" ) def test_Provider_NotSupported(): with pytest.raises(Exception) as error: provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") collection.datafile("xml") assert str(error.value) == "Unsupported data output format: xml" diff --git a/tests/support/schemas/schema.yaml b/tests/support/schemas/schema.yaml index 1f61c07c..0da12c0d 100644 --- a/tests/support/schemas/schema.yaml +++ b/tests/support/schemas/schema.yaml @@ -25,6 +25,7 @@ properties: - custom_json - json - iiif_json + - iiif_json_v3 - oai_xml - sequential_csv - xml diff --git a/tests/tasks/test_archive.py b/tests/tasks/test_archive.py index 8b306333..3710174a 100644 --- a/tests/tasks/test_archive.py +++ b/tests/tasks/test_archive.py @@ -9,7 +9,7 @@ test_working = Path("test-working") test_archive = Path("test-archive") -test_dir = test_working / "aub" / "aco" +test_dir = test_working / "aub" / "aladab" test_csv = test_dir / "data.csv" test_json = test_dir / "data.json" test_now = datetime.datetime(2023, 3, 13, 18, 6, 31) @@ -56,13 +56,13 @@ def setup(monkeypatch): def test_archive_dir(): provider = Provider("aub") - collection = Collection(provider, "aco") - assert collection.archive_dir().endswith("archive/aub/aco") + collection = Collection(provider, "aladab") + assert collection.archive_dir().endswith("archive/aub/aladab") def test_csv_with_data(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") fh = test_csv.open("w") fh.write("id,author,title\n") @@ -73,7 +73,7 @@ def test_csv_with_data(setup, mock_collection_datafile, mock_now): assert result is not None assert result["csv"].endswith( - 
"test-archive/aub/aco/data-20230313180631.csv" + "test-archive/aub/aladab/data-20230313180631.csv" ), "returned CSV archive filename" assert Path(result["csv"]).is_file(), "archived file exists" assert test_csv.is_file(), "original data file should still be there" @@ -81,7 +81,7 @@ def test_csv_with_data(setup, mock_collection_datafile, mock_now): def test_empty_csv(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") test_csv.touch() result = archive_collection(collection=collection) @@ -91,7 +91,7 @@ def test_empty_csv(setup, mock_collection_datafile, mock_now): def test_csv_with_header(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") test_csv.open("w").write("id,author,title\n") @@ -103,7 +103,7 @@ def test_csv_with_header(setup, mock_collection_datafile, mock_now): # mock_now not used here since we want to call at two different times def test_identical_csv(setup, mock_collection_datafile): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") fh = test_csv.open("w") fh.write("id,author,title\n") @@ -132,7 +132,7 @@ def test_identical_csv(setup, mock_collection_datafile): def test_json_with_data(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") fh = test_json.open("w") fh.write("""{"id": 1, "title": "Maṭnawīye Ma'nawī"}\n""") @@ -142,7 +142,7 @@ def test_json_with_data(setup, mock_collection_datafile, mock_now): assert len(result) != 0 assert result["json"].endswith( - "test-archive/aub/aco/data-20230313180631.json" + "test-archive/aub/aladab/data-20230313180631.json" ), "returned archived json filename" assert Path(result["json"]).is_file(), "archived file exists" assert 
test_json.is_file(), "original data file should still be there" @@ -150,7 +150,7 @@ def test_json_with_data(setup, mock_collection_datafile, mock_now): def test_empty_json(setup, mock_collection_datafile, mock_now): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") test_json.touch() result = archive_collection(collection=collection) diff --git a/tests/tasks/test_index.py b/tests/tasks/test_index.py index 42baa9aa..f0fc557e 100644 --- a/tests/tasks/test_index.py +++ b/tests/tasks/test_index.py @@ -24,6 +24,6 @@ def mock_post(self, **data): def test_index_collection(mock_request): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") params = {"collection": collection} assert index_collection(**params) == "Harvest successfully initiated" diff --git a/tests/utils/test_dataframe.py b/tests/utils/test_dataframe.py index cc345eac..e647f406 100644 --- a/tests/utils/test_dataframe.py +++ b/tests/utils/test_dataframe.py @@ -22,13 +22,13 @@ def mock_datafile(_, format): def test_dataframe_from_csv_file(mock_collection_datafile): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") assert dataframe_from_file(collection)["id"].count() == 4 def test_dataframe_from_json_file(mock_collection_datafile): provider = Provider("aub") - collection = Collection(provider, "aco") + collection = Collection(provider, "aladab") assert dataframe_from_file(collection, "json")["id"].count() == 2 diff --git a/tests/utils/test_partition_url_builder.py b/tests/utils/test_partition_url_builder.py index 9db23bf4..67acf94a 100644 --- a/tests/utils/test_partition_url_builder.py +++ b/tests/utils/test_partition_url_builder.py @@ -86,9 +86,10 @@ def test_prefetch_page_urls(requests_mock): partitionBuilder = PartitionBuilder( collection_url=collection_url, paging_config={ - "pages_url": 
"https://example.com/collection", + "pages_url": "https://example.com/collection?limit={limit}&offset={offset}", "urls": "data.id", "limit": 3, + "page_data": "data", }, )