Skip to content

Commit

Permalink
Refactoring to improve complexity
Browse files Browse the repository at this point in the history
  • Loading branch information
aaron-collier committed Oct 1, 2024
1 parent 4d2a788 commit acf80ae
Show file tree
Hide file tree
Showing 2 changed files with 251 additions and 62 deletions.
133 changes: 71 additions & 62 deletions dlme_airflow/drivers/iiif_json_v3.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import time
import logging
import intake
import requests
Expand Down Expand Up @@ -28,37 +27,41 @@ def __init__(
self.record_count = 0
self.record_limit = self.metadata.get("record_limit")


def _open_collection(self):
    """Store the manifest URLs returned by ``_get_manifest_urls()`` on the instance."""
    self._manifest_urls = self._get_manifest_urls()

def _get_manifest_urls(self):
    """Return the manifest URLs for this collection.

    When ``self.paging`` is truthy the URLs come from
    ``PartitionBuilder(self.collection_url, self.paging).urls()``;
    otherwise they come from ``_get_manifest_urls_from_items()``.
    """
    if self.paging:
        return PartitionBuilder(self.collection_url, self.paging).urls()
    return self._get_manifest_urls_from_items()


def _get_manifest_urls_from_items(self):
resp = self._get(self.collection_url)
if resp.status_code == 200:
if self.paging:
self._manifest_urls = PartitionBuilder(self.collection_url, self.paging).urls()
else:
collection_result = resp.json()
if "manifests" in collection_result: # IIIF v2
manifests = collection_result["manifests"]
elif "items" in collection_result: # IIIF v3
manifests = collection_result["items"]
else:
raise Exception(
f"Unknown collection manifest format: {self.collection_url}"
)

for manifest in manifests:
if "@id" in manifest:
url = manifest["@id"] # valid in IIIF v2 or v3
elif "id" in manifest:
url = manifest["id"] # valid in IIIF v3 only
else:
raise Exception(f"Unknown URL in manifest: {manifest}")
self._manifest_urls.append(url)
collection_result = resp.json()
urls = []
if "items" in collection_result: # IIIF v3
manifests = collection_result["items"]
else:
logging.error(f"got {resp.status_code} when fetching {self.collection_url}")
raise Exception(
f"Unknown collection manifest format: {self.collection_url}"
)

for manifest in manifests:
if "@id" in manifest:
url = manifest["@id"] # valid in IIIF v2 or v3
elif "id" in manifest:
url = manifest["id"] # valid in IIIF v3 only
else:
raise Exception(f"Unknown URL in manifest: {manifest}")
urls.append(url)

return urls

def _open_manifest(self, manifest_url: str) -> Optional[dict]:
logging.info(f"getting manifest {manifest_url}")
print(f"Getting manifest {manifest_url}")

def _open_manifest(self, manifest_url: str) -> Optional[dict]:
resp = self._get(manifest_url)
if resp.status_code == 200:
manifest_result = resp.json()
Expand All @@ -77,34 +80,36 @@ def _open_manifest(self, manifest_url: str) -> Optional[dict]:
return record

def _extract_specified_fields(self, iiif_manifest: dict) -> dict:
output: dict[str, Any] = {}
output: dict [str, Any] = {}
for name, info in self.metadata.get("fields").items():
expression = self._path_expressions.get(name)
result = [match.value for match in expression.find(iiif_manifest)]
if (
len(result) < 1
): # the JSONPath expression didn't find anything in the manifest
if info.get("optional") is True:
logging.debug(
f"{iiif_manifest.get('@id')} missing optional field: '{name}'; searched path: '{expression}'"
)
else:
logging.warning(
f"{iiif_manifest.get('@id')} missing required field: '{name}'; searched path: '{expression}'"
)
result = self._get_data_for_field(name, iiif_manifest)

if not result:
self._optional_field_warning(iiif_manifest.get("id"), name, self._path_expressions.get(name), info.get("optional"))
continue

processed_result = _stringify_and_strip_if_list(result)

if name in output:
output.update({name: _flatten_list([output[name], processed_result])})
else:
if (
len(result) == 1
): # the JSONPath expression found exactly one result in the manifest
output[name] = _stringify_and_strip_if_list(result[0])
else: # the JSONPath expression found exactly one result in the manifest
if name not in output:
output[name] = []

for data in result:
output[name].append(_stringify_and_strip_if_list(data))
output[name] = processed_result

return output


def _get_data_for_field(self, field, manifest):
    """Return the values matched in ``manifest`` by the path expression for ``field``.

    The expression is looked up in ``self._path_expressions``; the result is
    the ``.value`` of every match returned by ``expression.find(manifest)``,
    or an empty list when nothing matches.
    """
    expression = self._path_expressions.get(field)
    return [match.value for match in expression.find(manifest)]

def _optional_field_warning(self, id, field, expression, optional):
    """Log that ``field`` was not found for the manifest identified by ``id``.

    Logs at DEBUG level when ``optional`` is exactly ``True`` and at WARNING
    level for any other value (including ``None`` for fields that do not
    declare ``optional``).
    """
    if optional is True:
        logging.debug(f"{id} missing optional field: '{field}'; searched path: '{expression}'")
        return

    logging.warning(f"{id} missing required field: '{field}'; searched path: '{expression}'")


def _extract_manifest_metadata(
self, iiif_manifest_metadata
) -> dict[str, list[str]]:
Expand All @@ -120,17 +125,14 @@ def _extract_manifest_metadata(
.replace(")", "")
)
# initialize or append to output[name] based on whether we've seen the label
metadata_value = row.get("value")[key][0]
metadata_value = row.get("value")[key]
if not metadata_value:
continue

if isinstance(metadata_value[0], dict):
metadata_value = metadata_value[0].get("@value")

if name in output:
output[name].append(metadata_value)
output.update({name: _flatten_list([output[name], metadata_value])})
else:
output[name] = [metadata_value]
output[name] = metadata_value

# flatten any nested lists into a single list
return {k: list(_flatten_list(v)) for (k, v) in output.items()}
Expand Down Expand Up @@ -179,11 +181,18 @@ def read(self):
return df


def _stringify_and_strip_if_list(possible_list) -> list[str]:
if isinstance(possible_list, list):
return [str(elt).strip() for elt in possible_list]
else:
return possible_list
def _stringify_and_strip_if_list(record) -> "str | list":
    """Recursively strip strings inside ``record``, unwrapping one-element lists.

    * A string is returned with surrounding whitespace removed.
    * An iterable is processed element by element; if exactly one element
      results, that element is returned on its own, otherwise the list of
      processed elements is returned (an empty iterable yields ``[]``).
    * Any other value (for example a JSON number, boolean or null) is
      converted with ``str()`` and stripped instead of raising ``TypeError``.
    """
    if isinstance(record, str):
        return str(record).strip()

    try:
        children = iter(record)
    except TypeError:
        # Non-iterable scalars cannot be walked; stringify them so a numeric
        # value in a manifest does not abort the whole record.
        return str(record).strip()

    result_list = [_stringify_and_strip_if_list(child) for child in children]

    if len(result_list) == 1:
        return result_list[0]

    return result_list


def _flatten_list(lst: list) -> Generator:
Expand Down
180 changes: 180 additions & 0 deletions tests/drivers/test_iiif_json_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import logging
import pytest
import requests
import pandas as pd

from dlme_airflow.drivers.iiif_json_v3 import IiifV3JsonSource

LOGGER = logging.getLogger(__name__)


class MockIIIFCollectionV2Response:
    """Fake successful HTTP response whose body is an IIIF v2 style collection.

    The collection lists its manifests under the ``"manifests"`` key and
    identifies each one with ``"@id"``.
    """

    @property
    def status_code(self):
        return 200

    @staticmethod
    def json():
        return {
            "manifests": [
                {"@id": "https://collection.edu/iiif/p15795coll29:28/manifest.json"}
            ]
        }


class MockIIIFCollectionV3Response:
    """Fake successful HTTP response whose body is an IIIF v3 style collection.

    The collection lists its manifests under the ``"items"`` key and
    identifies each one with ``"id"``.
    """

    @property
    def status_code(self):
        return 200

    @staticmethod
    def json():
        return {
            "items": [
                {"id": "https://collection.edu/iiif/p15795coll29:28/manifest.json"}
            ]
        }


class MockIIIFManifestResponse:
    """Fake successful HTTP response whose body is a single IIIF manifest.

    The payload deliberately contains a repeated metadata label
    ("Title (sub)"), a nested list value ("Date Created") and a description
    entry with surrounding whitespace so the tests can exercise merging,
    flattening and stripping.
    """

    @property
    def status_code(self):
        return 200

    @staticmethod
    def json():
        return {
            "@context": "http://iiif.io/api/presentation/3/context.json",
            "id": "https://collection.edu/iiif/p15795coll29:28/manifest.json",
            "metadata": [
                {
                    "label": {
                        "en": ["Source"]
                    },
                    "value": {
                        "en": ["Rare Books and Special Collections Library"]
                    },
                },
                {"label": {"en": ["Title (main)"]}, "value": {"en": ["A great title of the Middle East"]}},
                {"label": {"en": ["Title (sub)"]}, "value": {"en": ["Subtitle 1"]}},
                {"label": {"en": ["Title (sub)"]}, "value": {"en": ["Subtitle 2"]}},
                {"label": {"en": ["Date Created"]}, "value": {"en": [["1974"]]}},
            ],
            "sequences": [
                {
                    "canvases": [
                        {"images": [{"resource": {"format": "image/jpeg"}}]},
                        {"images": [{"resource": {"format": "image/jpeg"}}]},
                    ]
                }
            ],
            "description": ["A descriptive phrase", " with further elaboration "],
        }


@pytest.fixture
def mock_response(monkeypatch):
    """Replace ``requests.get`` with a stub that serves the canned IIIF responses.

    The stub picks a response from the URL suffix: ``v2_collection.json``,
    ``v3_collection.json`` or ``manifest.json``. Any other URL yields ``None``.
    """

    def mock_get(*args, **kwargs):
        # requests.get accepts the URL positionally or as the ``url`` keyword.
        url = args[0] if args else kwargs["url"]
        if url.endswith("v2_collection.json"):
            return MockIIIFCollectionV2Response()
        if url.endswith("v3_collection.json"):
            return MockIIIFCollectionV3Response()
        if url.endswith("manifest.json"):
            return MockIIIFManifestResponse()
        return

    monkeypatch.setattr(requests, "get", mock_get)


@pytest.fixture
def iiif_test_v3_source():
    """Build an ``IiifV3JsonSource`` configured with a mix of field cases.

    The field map covers a single-valued field, a multi-valued field, a
    missing required field and a missing optional field.
    """
    metadata = {
        "fields": {
            "context": {
                "path": "@context",
                "optional": True,
            },  # a specified field with one value in the metadata
            "description_top": {"path": "description", "optional": True},
            "iiif_format": {
                "path": "sequences..format"
            },  # a specified field with multiple values in the metadata
            "profile": {"path": "sequences..profile"},  # a missing required field
            "thumbnail": {
                "path": "thumbnail..@id",
                "optional": True,
            },  # missing optional field
        }
    }
    return IiifV3JsonSource(
        collection_url="http://iiif_v3_collection.json", metadata=metadata
    )


def test_IiifJsonSource_initial(iiif_test_v3_source, mock_response):
    """A freshly constructed source has not collected any manifest URLs yet."""
    assert len(iiif_test_v3_source._manifest_urls) == 0


def test_IiifJsonSource_get_schema(iiif_test_v3_source, mock_response):
    """After ``_get_schema()`` the first manifest URL is the one in the mocked collection."""
    iiif_test_v3_source._get_schema()
    assert (
        iiif_test_v3_source._manifest_urls[0]
        == "https://collection.edu/iiif/p15795coll29:28/manifest.json"
    )


def test_IiifJsonSource_read(iiif_test_v3_source, mock_response):
    """``read()`` yields exactly the expected columns, in order."""
    iiif_df = iiif_test_v3_source.read()
    test_columns = [
        "context",
        "description_top",
        "iiif_format",
        "source",
        "title-main",
        "title-sub",
        "date-created",
    ]
    # Compare the complete lists: zip() stops at the shorter input, so a
    # pairwise check would still pass with missing or unexpected columns.
    assert list(iiif_df.columns) == test_columns


def test_IiifJsonSource_df(iiif_test_v3_source, mock_response):
    """``read()`` returns a one-row frame equal to the expected values for the mocked manifest."""
    iiif_df = iiif_test_v3_source.read()
    test_df = pd.DataFrame(
        [
            {
                "context": "http://iiif.io/api/presentation/3/context.json",
                "description_top": ["A descriptive phrase", "with further elaboration"],
                "iiif_format": ["image/jpeg", "image/jpeg"],
                "source": ["Rare Books and Special Collections Library"],
                "title-main": ["A great title of the Middle East"],
                "title-sub": ["Subtitle 1", "Subtitle 2"],
                "date-created": ["1974"],
            }
        ]
    )
    assert iiif_df.equals(test_df)


def test_IiifJsonSource_logging(iiif_test_v3_source, mock_response, caplog):
    """Missing required fields are logged at WARNING, missing optional ones at DEBUG."""
    with caplog.at_level(logging.WARNING):
        iiif_test_v3_source.read()
    assert (
        "https://collection.edu/iiif/p15795coll29:28/manifest.json missing required field: 'profile'; searched path: 'sequences..profile'"  # noqa: E501
        in caplog.text
    )
    # At WARNING level the DEBUG message about optional fields must not appear.
    assert "missing optional field" not in caplog.text

    with caplog.at_level(logging.DEBUG):
        iiif_test_v3_source.read()
    assert (
        "https://collection.edu/iiif/p15795coll29:28/manifest.json missing optional field: 'thumbnail'; searched path: 'thumbnail..@id'"  # noqa: E501
        in caplog.text
    )


# def test_wait(iiif_test_v3_source):
# driver = IiifV3JsonSource("https://example.com/iiif/", wait=2)
# assert driver, "IiifJsonSource constructor accepts wait parameter"


def test_list_encode(iiif_test_v3_source, mock_response):
    """A nested metadata value (``[["1974"]]`` in the mock) is read back as a flat list."""
    iiif_df = iiif_test_v3_source.read()
    assert iiif_df["date-created"][0] == ["1974"]

0 comments on commit acf80ae

Please sign in to comment.