Skip to content

Commit

Permalink
Add IIIF_v3 Driver to support hashed metadata (#528)
Browse files Browse the repository at this point in the history
* Add IIIF_v3 Driver to support hashed metadata

Refactoring

Refactoring to improve complexity

* Update tests to match updated paging configuration

* Update brooklyn config to support new paging options

Remove extra list flattening

Update tests after removing aub/aco collection

WIP

Remove debugging print statements
  • Loading branch information
aaron-collier authored Jan 8, 2025
1 parent f6dc04c commit 51ba734
Show file tree
Hide file tree
Showing 17 changed files with 713 additions and 102 deletions.
2 changes: 2 additions & 0 deletions bin/get
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def main(opts):
if collection is None:
sys.exit(f'💥 Provider "{opts.provider}" does not have a collection "{opts.collection}"')

print(f"Harvesting collection: {opts.collection}")

# set driver record limit if it is allowed by the driver
if opts.limit:
if hasattr(collection.catalog, 'record_limit'):
Expand Down
127 changes: 55 additions & 72 deletions catalogs/aub.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,105 +3,88 @@ metadata:
data_path: aub
schedule: "30 13 15 Jan,Apr,Jul,Oct *"
sources:
aco:
driver: oai_xml
args:
collection_url: https://libraries.aub.edu.lb/xtf/oai
metadata_prefix: oai_dc
set: "aco"
allow_expiration: true
full_harvest: true
metadata:
data_path: aub/aco
config: aub
fields:
id:
path: "//header:identifier"
namespace:
header: "http://www.openarchives.org/OAI/2.0/"
optional: true
aladab:
driver: oai_xml
driver: iiif_json_v3
args:
collection_url: https://libraries.aub.edu.lb/xtf/oai
metadata_prefix: oai_dc
set: "aladab"
allow_expiration: true
full_harvest: true
collection_url: https://libraries.aub.edu.lb/iiifservices/collection/al-Adab
paging:
pages_url: https://libraries.aub.edu.lb/iiifservices/collection/al-Adab/{offset}/{limit}
page_data: items
page_fields:
- id
- thumbnail.id
limit: 1010
metadata:
data_path: aub/aladab
config: aub
fields:
id:
path: "//header:identifier"
namespace:
header: "http://www.openarchives.org/OAI/2.0/"
optional: true
path: "id"
postcards:
driver: oai_xml
driver: iiif_json_v3
args:
collection_url: https://libraries.aub.edu.lb/xtf/oai
metadata_prefix: oai_dc
set: "postcards"
allow_expiration: true
full_harvest: true
collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Postcards
paging:
pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Postcards/{offset}/{limit}
page_data: items
page_fields:
- id
- thumbnail.id
limit: 1010
metadata:
data_path: aub/postcards
config: aub
fields:
id:
path: "//header:identifier"
namespace:
header: "http://www.openarchives.org/OAI/2.0/"
optional: true
path: "id"
posters:
driver: oai_xml
driver: iiif_json_v3
args:
collection_url: https://libraries.aub.edu.lb/xtf/oai
metadata_prefix: oai_dc
set: "posters"
allow_expiration: true
full_harvest: true
collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Posters
paging:
pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Posters/{offset}/{limit}
page_data: items
page_fields:
- id
- thumbnail.id
limit: 1000
metadata:
data_path: aub/posters
data_path: aub/posters
config: aub
fields:
id:
path: "//header:identifier"
namespace:
header: "http://www.openarchives.org/OAI/2.0/"
optional: true
thamarat_al_funun:
driver: oai_xml
path: "id"
travelbooks:
driver: iiif_json_v3
args:
collection_url: https://libraries.aub.edu.lb/xtf/oai
metadata_prefix: oai_dc
set: "thf"
allow_expiration: true
full_harvest: true
collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Travel%20Books
paging:
pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Travel%20Books/{offset}/{limit}
page_data: items
page_fields:
- id
- thumbnail.id
limit: 1000
metadata:
data_path: aub/thamarat_al_funun
data_path: aub/travelbooks
config: aub
fields:
id:
path: "//header:identifier"
namespace:
header: "http://www.openarchives.org/OAI/2.0/"
optional: true
travelbooks:
driver: oai_xml
path: "id"
manuscripts:
driver: iiif_json_v3
args:
collection_url: https://libraries.aub.edu.lb/xtf/oai
metadata_prefix: oai_dc
set: "travelbooks"
allow_expiration: true
full_harvest: true
collection_url: https://libraries.aub.edu.lb/iiifservices/collection/Manuscripts
paging:
pages_url: https://libraries.aub.edu.lb/iiifservices/collection/Manuscripts/{offset}/{limit}
page_data: items
page_fields:
- id
- thumbnail.id
limit: 1000
metadata:
data_path: aub/travelbooks
data_path: aub/manuscripts
config: aub
fields:
id:
path: "//header:identifier"
namespace:
header: "http://www.openarchives.org/OAI/2.0/"
optional: true
path: "id"
3 changes: 2 additions & 1 deletion catalogs/brooklyn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ sources:
args:
collection_url: https://www.brooklynmuseum.org/api/v2/object/
paging:
pages_url: https://www.brooklynmuseum.org/api/v2/collection/5/object
pages_url: https://www.brooklynmuseum.org/api/v2/collection/5/object?limit={limit}&offset={offset}
urls: data.id
limit: 25
page_data: data
record_selector: "data"
api_key: "0IzFpBiUksT8LMVGLUxovj9IR0ltlSH1"
metadata:
Expand Down
2 changes: 2 additions & 0 deletions dlme_airflow/drivers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import intake

from dlme_airflow.drivers.iiif_json import IiifJsonSource
from dlme_airflow.drivers.iiif_json_v3 import IiifV3JsonSource
from dlme_airflow.drivers.oai_xml import OaiXmlSource
from dlme_airflow.drivers.xml import XmlSource
from dlme_airflow.drivers.sequential_csv import SequentialCsvSource
Expand All @@ -9,6 +10,7 @@

def register_drivers():
intake.source.register_driver("iiif_json", IiifJsonSource)
intake.source.register_driver("iiif_json_v3", IiifV3JsonSource)
intake.source.register_driver("oai_xml", OaiXmlSource)
intake.source.register_driver("xml", XmlSource)
intake.source.register_driver("sequential_csv", SequentialCsvSource)
Expand Down
182 changes: 182 additions & 0 deletions dlme_airflow/drivers/iiif_json_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import logging
import intake
import requests
import jsonpath_ng
import pandas as pd
from typing import Any, Optional, Generator
from dlme_airflow.utils.partition_url_builder import PartitionBuilder


class IiifV3JsonSource(intake.source.base.DataSource):
    """Intake driver that harvests records from a IIIF Presentation API v3
    collection and exposes them as a pandas DataFrame (one partition per
    manifest).

    Field extraction is driven by the catalog ``metadata.fields`` mapping,
    where each field supplies a jsonpath expression. Values found in a
    manifest's ``metadata`` label/value blocks are merged into the record
    as well, along with the ``thumbnail`` carried on the collection entry.
    """

    container = "dataframe"
    name = "iiif_json_v3"
    version = "0.0.2"
    partition_access = True

    def __init__(
        self,
        collection_url,
        paging=None,
        metadata=None
    ):
        """Create the source.

        Args:
            collection_url: URL of the IIIF v3 collection document.
            paging: optional paging configuration (``pages_url``,
                ``page_data``, ``page_fields``, ``limit``) handed to
                PartitionBuilder to enumerate the collection's manifests.
            metadata: intake metadata; may carry ``fields`` (jsonpath
                extraction specs) and ``record_limit``.
        """
        super().__init__(metadata=metadata)
        self.collection_url = collection_url
        self.paging = paging
        self._manifests = []
        self._path_expressions = {}
        self.record_count = 0
        # Optional cap on harvested records, supplied via catalog metadata.
        self.record_limit = self.metadata.get("record_limit")
        self.partition_builder = None

        if self.paging:
            self.partition_builder = PartitionBuilder(self.collection_url, self.paging)

    def _open_collection(self):
        # Populate the manifest list; called once from _get_schema.
        self._manifests = self._get_manifests()

    def _get_manifests(self):
        """Return the list of manifest stubs for the collection.

        Fixed: without a paging configuration this previously returned
        None, which crashed ``len(self._manifests)`` in _get_schema.
        Return an empty list instead so the schema stays well-defined.
        """
        if self.paging:
            return self.partition_builder.records()
        return []

    def _open_manifest(self, manifest: dict) -> Optional[dict]:
        """Fetch one manifest and extract a flat record dict from it.

        Returns None when the manifest cannot be retrieved.
        """
        manifest_url = manifest["id"]
        resp = self._get(manifest_url)
        if resp.status_code == 200:
            manifest_result = resp.json()
        else:
            logging.error(
                f"got {resp.status_code} when fetching manifest {manifest_url}"
            )
            return None

        record = self._extract_specified_fields(manifest_result)

        # Handles metadata in IIIF manifest
        record.update(
            self._extract_manifest_metadata(manifest_result.get("metadata", []))
        )

        # Handles the thumbnail field provided in the collection manifest
        record.update({"thumbnail": manifest.get("thumbnail")})
        return record

    def _extract_specified_fields(self, iiif_manifest: dict) -> dict:
        """Apply each configured jsonpath to the manifest, collecting hits."""
        output: dict[str, Any] = {}
        # Default to {} for consistency with _get_schema's fields lookup.
        for name, info in self.metadata.get("fields", {}).items():
            result = self._get_data_for_field(name, iiif_manifest)

            if not result:
                self._optional_field_warning(
                    iiif_manifest.get("id"),
                    name,
                    self._path_expressions.get(name),
                    info.get("optional"),
                )
                continue

            processed_result = _stringify_and_strip_if_list(result)

            if name in output:
                # Repeated field: merge old and new values into a flat stream.
                output.update({name: _flatten_list([output[name], processed_result])})
            else:
                output[name] = processed_result

        return output

    def _get_data_for_field(self, field, manifest):
        """Return all jsonpath matches for *field* within *manifest*."""
        expression = self._path_expressions.get(field)
        return [match.value for match in expression.find(manifest)]

    def _optional_field_warning(self, id, field, expression, optional):
        """Log a field miss: debug-level when optional, warning otherwise."""
        if optional is True:
            logging.debug(f"{id} missing optional field: '{field}'; searched path: '{expression}'")
            return

        logging.warning(f"{id} missing required field: '{field}'; searched path: '{expression}'")

    def _extract_manifest_metadata(
        self, iiif_manifest_metadata
    ) -> dict[str, list[str]]:
        """Flatten a IIIF v3 ``metadata`` array into {label: [values, ...]}."""
        output: dict[str, list[str]] = {}

        for row in iiif_manifest_metadata:
            (label, values) = self._extract_metadata_for_row(row)
            output.setdefault(label, []).extend(values)

        return output

    def _extract_metadata_for_row(self, row):
        """Return (normalized_label, values) for one metadata row.

        The label comes from the row's first language-map entry, lowercased
        with spaces replaced by hyphens and parentheses removed.
        NOTE(review): assumes the ``value`` language map shares its language
        keys with ``label`` — confirm against the providers' data.
        """
        values = []
        lang = next(iter(row.get("label")))
        label = row.get("label")[lang][0].replace(" ", "-").lower().replace("(", "").replace(")", "")
        for key in row.get("label").keys():
            # initialize or append to output[name] based on whether we've seen the label
            values += row.get("value")[key]

        return label, values

    def _get_partition(self, i) -> pd.DataFrame:
        """Read partition *i*: one manifest converted to a one-row DataFrame."""
        # if we are over the defined limit return an empty DataFrame right away
        if self.record_limit is not None and self.record_count > self.record_limit:
            return pd.DataFrame()

        result = self._open_manifest(self._manifests[i])

        # If the dictionary has AT LEAST one value that is not None return a
        # DataFrame with the keys as columns, and the values as a row.
        # Otherwise return an empty DataFrame that can be concatenated.
        # This will prevent rows with all empty values from being generated
        # For context see https://github.com/sul-dlss/dlme-airflow/issues/192

        if result is not None and any(result.values()):
            self.record_count += 1
            return pd.DataFrame([result])
        else:
            # Fixed: previously referenced self._manifest_urls, which is not
            # an attribute of this class and raised AttributeError here.
            logging.warning(
                f"{self._manifests[i].get('id')} resulted in empty DataFrame"
            )
            return pd.DataFrame()

    def _get_schema(self):
        """Compile the jsonpath expressions, load the manifest list, and
        report the partition count (one partition per manifest) to intake."""
        for name, info in self.metadata.get("fields", {}).items():
            self._path_expressions[name] = jsonpath_ng.parse(info.get("path"))
        self._open_collection()
        return intake.source.base.Schema(
            datashape=None,
            dtype=self.dtype,
            shape=None,
            npartitions=len(self._manifests),
            extra_metadata={},
        )

    def _get(self, url):
        """Perform an HTTP GET for *url* (single seam for network access)."""
        return requests.get(url)

    def read(self):
        """Read every partition into one DataFrame, honoring record_limit."""
        self._load_metadata()
        df = pd.concat([self.read_partition(i) for i in range(self.npartitions)])
        if self.record_limit:
            return df.head(self.record_limit)
        else:
            return df


def _stringify_and_strip_if_list(record) -> list[str]:
if isinstance(record, str):
return str(record).strip()

result_list = []
for data in record:
result_list.append(_stringify_and_strip_if_list(data))

if len(result_list) == 1:
return result_list[0]

return result_list


def _flatten_list(lst: list) -> Generator:
for item in lst:
if type(item) is list:
yield from _flatten_list(item)
else:
yield item
Loading

0 comments on commit 51ba734

Please sign in to comment.