diff --git a/dlme_airflow/drivers/iiif_json_v3.py b/dlme_airflow/drivers/iiif_json_v3.py index 73eb2cf..3b60e30 100644 --- a/dlme_airflow/drivers/iiif_json_v3.py +++ b/dlme_airflow/drivers/iiif_json_v3.py @@ -7,14 +7,21 @@ from typing import Any, Optional, Generator from dlme_airflow.utils.partition_url_builder import PartitionBuilder -container = "dataframe" -name = "iiif_json_v3" -version = "0.0.2" -partition_access = True - class IiifV3JsonSource(intake.source.base.DataSource): - def __init__(self, collection_url, dtype=None, metadata=None, wait=None, paging=None): + container = "dataframe" + name = "iiif_json_v3" + version = "0.0.2" + partition_access = True + + def __init__( + self, + collection_url, + dtype=None, + metadata=None, + wait=None, + paging=None + ): super(IiifV3JsonSource, self).__init__(metadata=metadata) self.collection_url = collection_url self.dtype = dtype diff --git a/dlme_airflow/utils/partition_url_builder.py b/dlme_airflow/utils/partition_url_builder.py index afbd229..5f7af88 100644 --- a/dlme_airflow/utils/partition_url_builder.py +++ b/dlme_airflow/utils/partition_url_builder.py @@ -66,16 +66,11 @@ def _prefetch_page_urls(self): ids = [] while True: api_endpoint = self.paging_config['pages_url'].format(offset,self.paging_config['limit']) - print(f"Fetching {api_endpoint}") data = self._fetch_provider_data(api_endpoint)[self.paging_config['page_data']] offset += self.paging_config["limit"] harvested = len(data) - for i in data: - if validators.url(i['id']): - ids.append(i['id']) - else: - ids.append(f"{self.collection_url}{i['id']}") + ids += self._extract_ids(data) if harvested < self.paging_config["limit"]: break @@ -90,3 +85,11 @@ def _fetch_provider_data(self, url): resp = requests.get(url, headers=headers) if resp.status_code == 200: return resp.json() + + def _extract_ids(self, data): + return [self._format_id(i['id']) for i in data] + + def _format_id(self, id): + if validators.url(id): + return id + return f"{self.collection_url}{id}"