From 9b2661696db027867061f237639167ed3684c6b5 Mon Sep 17 00:00:00 2001 From: Joe Corall Date: Fri, 19 Apr 2024 11:13:21 -0400 Subject: [PATCH] Add hOCR functionality (#1006) * Add hOCR functionality * Fix wording in Islandora IIIF View Style config form. * Islandora IIIF: Attempt to fix broken config schema. * Islandora IIIF: Remove labels from config schema. * Islandora IIIF: Add labels to config schema items. * Update IIIFManifest.php * Move to memoized term lookup, instead of polluting method call. * Deal explicitly with URIs, instead of attempting to put the entity in config. * hocr Islandora IIIF: Add missing null check. --------- Co-authored-by: Alexander O'Neill Co-authored-by: Alexander O'Neill Co-authored-by: Rosie Le Faive Co-authored-by: Adam Vessey --- .../config/schema/islandora_iiif.schema.yml | 12 ++ .../src/Plugin/views/style/IIIFManifest.php | 135 ++++++++++++++++-- 2 files changed, 136 insertions(+), 11 deletions(-) diff --git a/modules/islandora_iiif/config/schema/islandora_iiif.schema.yml b/modules/islandora_iiif/config/schema/islandora_iiif.schema.yml index 1f91450fa..11fff4c71 100644 --- a/modules/islandora_iiif/config/schema/islandora_iiif.schema.yml +++ b/modules/islandora_iiif/config/schema/islandora_iiif.schema.yml @@ -17,5 +17,17 @@ views.style.iiif_manifest: mapping: iiif_tile_field: type: sequence + label: "Tile source field(s)" sequence: type: string + iiif_ocr_file_field: + type: sequence + label: "Structured OCR data file field" + sequence: + type: string + structured_text_term_uri: + type: string + label: "Structured text term" + search_endpoint: + type: string + label: "Search endpoint path" diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index 2fc8a431a..f0aca47da 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -11,6 +11,8 @@ use
Drupal\Core\Form\FormStateInterface; use Drupal\Core\Messenger\MessengerInterface; use Drupal\Core\Url; +use Drupal\islandora\IslandoraUtils; +use Drupal\taxonomy\TermInterface; use Drupal\views\Plugin\views\style\StylePluginBase; use Drupal\views\ResultRow; use GuzzleHttp\Client; @@ -35,6 +37,13 @@ */ class IIIFManifest extends StylePluginBase { + /** + * Islandora utility functions. + * + * @var \Drupal\islandora\IslandoraUtils + */ + protected $utils; + /** * {@inheritdoc} */ @@ -108,10 +117,24 @@ class IIIFManifest extends StylePluginBase { */ protected $moduleHandler; + /** + * Memoized structured text term. + * + * @var \Drupal\taxonomy\TermInterface|null + */ + protected ?TermInterface $structuredTextTerm; + + /** + * Flag to track if we _have_ attempted a lookup, as the value is nullable. + * + * @var bool + */ + protected bool $structuredTextTermMemoized = FALSE; + /** * {@inheritdoc} */ - public function __construct(array $configuration, $plugin_id, $plugin_definition, SerializerInterface $serializer, Request $request, ImmutableConfig $iiif_config, EntityTypeManagerInterface $entity_type_manager, FileSystemInterface $file_system, Client $http_client, MessengerInterface $messenger, ModuleHandlerInterface $moduleHandler) { + public function __construct(array $configuration, $plugin_id, $plugin_definition, SerializerInterface $serializer, Request $request, ImmutableConfig $iiif_config, EntityTypeManagerInterface $entity_type_manager, FileSystemInterface $file_system, Client $http_client, MessengerInterface $messenger, ModuleHandlerInterface $moduleHandler, IslandoraUtils $utils) { parent::__construct($configuration, $plugin_id, $plugin_definition); $this->serializer = $serializer; @@ -121,6 +144,7 @@ public function __construct(array $configuration, $plugin_id, $plugin_definition $this->fileSystem = $file_system; $this->httpClient = $http_client; $this->messenger = $messenger; + $this->utils = $utils; $this->moduleHandler = $moduleHandler; } @@ -139,7 +163,8 
@@ public static function create(ContainerInterface $container, array $configuratio $container->get('file_system'), $container->get('http_client'), $container->get('messenger'), - $container->get('module_handler') + $container->get('module_handler'), + $container->get('islandora.utils') ); } @@ -217,6 +242,9 @@ public function render() { $content_type = 'json'; + // Add a search endpoint if one is defined. + $this->addSearchEndpoint($json, $url_components); + // Give other modules a chance to alter the manifest. $this->moduleHandler->alter('islandora_iiif_manifest', $json, $this); @@ -300,7 +328,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas ], ]; - if ($ocr_url = $this->getOcrUrl($entity, $row, $i)) { + if ($ocr_url = $this->getOcrUrl($entity)) { $tmp_canvas['seeAlso'] = [ '@id' => $ocr_url, 'format' => 'text/vnd.hocr+html', @@ -380,30 +408,38 @@ protected function getCanvasDimensions(string $iiif_url, FieldItemInterface $ima * * @param \Drupal\Core\Entity\EntityInterface $entity * The entity at the current row. - * @param \Drupal\views\ResultRow $row - * Result row. - * @param int $delta - * The delta in case there are multiple canvases on one media. * * @return string|false * The absolute URL of the current row's structured text, * or FALSE if none. */ - protected function getOcrUrl(EntityInterface $entity, ResultRow $row, $delta) { + protected function getOcrUrl(EntityInterface $entity) { $ocr_url = FALSE; $iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])) : []; $ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL; if ($ocrField) { - $ocr_entity = $ocrField->getEntity($row); + $ocr_entity = $entity; $ocr_field_name = $ocrField->definition['field_name']; if (!is_null($ocr_field_name)) { $ocrs = $ocr_entity->{$ocr_field_name}; - $ocr = isset($ocrs[$delta]) ? 
$ocrs[$delta] : FALSE; + $ocr = $ocrs[0] ?? FALSE; if ($ocr) { $ocr_url = $ocr->entity->createFileUrl(FALSE); } } } + elseif ($structured_text_term = $this->getStructuredTextTerm()) { + $parent_node = $this->utils->getParentNode($entity); + $ocr_entity_array = $this->utils->getMediaReferencingNodeAndTerm($parent_node, $structured_text_term); + $ocr_entity_id = is_array($ocr_entity_array) ? array_shift($ocr_entity_array) : NULL; + $ocr_entity = $ocr_entity_id ? $this->entityTypeManager->getStorage('media')->load($ocr_entity_id) : NULL; + if ($ocr_entity) { + $ocr_file_source = $ocr_entity->getSource(); + $ocr_fid = $ocr_file_source->getSourceFieldValue($ocr_entity); + $ocr_file = $this->entityTypeManager->getStorage('file')->load($ocr_fid); + $ocr_url = $ocr_file->createFileUrl(FALSE); + } + } return $ocr_url; } @@ -448,6 +484,29 @@ protected function defineOptions() { return $options; } + /** + * Add the configured search endpoint to the manifest. + * + * @param array $json + * The IIIF manifest. + * @param array $url_components + * The search endpoint URL as array. + */ + protected function addSearchEndpoint(array &$json, array $url_components) { + $url_base = $this->getRequest()->getSchemeAndHttpHost(); + $hocr_search_path = $this->options['search_endpoint']; + $hocr_search_url = $url_base . '/' . 
ltrim($hocr_search_path, '/'); + + $hocr_search_url = str_replace('%node', $url_components[1], $hocr_search_url); + + $json['service'][] = [ + "@context" => "http://iiif.io/api/search/0/context.json", + "@id" => $hocr_search_url, + "profile" => "http://iiif.io/api/search/0/search", + "label" => t("Search inside this work"), + ]; + } + /** * {@inheritdoc} */ @@ -504,10 +563,27 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { '#title' => $this->t('Structured OCR data file field'), '#type' => 'checkboxes', '#default_value' => $this->options['iiif_ocr_file_field'], - '#description' => $this->t('The source of structured OCR text for each entity.'), + '#description' => $this->t("If the hOCR is a field on the same entity as the image source field above, select it here. If it's found in a related entity via the term below, leave this blank."), '#options' => $field_options, '#required' => FALSE, ]; + + $form['structured_text_term'] = [ + '#type' => 'entity_autocomplete', + '#target_type' => 'taxonomy_term', + '#title' => $this->t('Structured OCR text term'), + '#default_value' => $this->getStructuredTextTerm(), + '#required' => FALSE, + '#description' => $this->t('Term indicating the media that holds structured text, such as hOCR, for the given object. Use this if the text is on a separate media from the tile source.'), + ]; + + $form['search_endpoint'] = [ + '#type' => 'textfield', + '#title' => $this->t("Search endpoint path."), + '#description' => $this->t("If there is a search endpoint to search within the book that returns IIIF annotations, put it here. Use %node substitution where needed.
E.g., paged-content-search/%node"), + '#default_value' => $this->options['search_endpoint'], + '#required' => FALSE, + ]; } /** @@ -520,4 +596,41 @@ public function getFormats() { return ['json' => 'json']; } + /** + * Submit handler for options form. + * + * Used to store the structured text media term by URI instead of by term ID (tid). + * + * @param array $form + * The form. + * @param \Drupal\Core\Form\FormStateInterface $form_state + * The form state object. + */ + // @codingStandardsIgnoreStart + public function submitOptionsForm(&$form, FormStateInterface $form_state) { + // @codingStandardsIgnoreEnd + $style_options = $form_state->getValue('style_options'); + $tid = $style_options['structured_text_term']; + unset($style_options['structured_text_term']); + $term = $this->entityTypeManager->getStorage('taxonomy_term')->load($tid); + $style_options['structured_text_term_uri'] = $this->utils->getUriForTerm($term); + $form_state->setValue('style_options', $style_options); + parent::submitOptionsForm($form, $form_state); + } + + /** + * Get the structured text term. + * + * @return \Drupal\taxonomy\TermInterface|null + * The term if it could be found; otherwise, NULL. + */ + protected function getStructuredTextTerm() : ?TermInterface { + if (!$this->structuredTextTermMemoized) { + $this->structuredTextTermMemoized = TRUE; + $this->structuredTextTerm = $this->utils->getTermForUri($this->options['structured_text_term_uri']); + } + + return $this->structuredTextTerm; + } + }