Skip to content

Commit

Permalink
Merge pull request #9 from Daniel-KM/feature/tsv
Browse files Browse the repository at this point in the history
Feature/tsv Thanks @Daniel-KM
  • Loading branch information
smachefert authored Feb 6, 2024
2 parents 9e4cc33 + bf95e88 commit 713a959
Show file tree
Hide file tree
Showing 9 changed files with 725 additions and 259 deletions.
96 changes: 65 additions & 31 deletions Module.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
namespace ExtractOcr;

use ExtractOcr\Form\ConfigForm;
use ExtractOcr\Job\ExtractOcr;
use Laminas\EventManager\Event;
use Laminas\EventManager\SharedEventManagerInterface;
use Laminas\Mvc\Controller\AbstractController;
use Laminas\ServiceManager\ServiceLocatorInterface;
use Laminas\View\Renderer\PhpRenderer;
use Omeka\Module\AbstractModule;
use Omeka\Module\Exception\ModuleCannotInstallException;
use Omeka\Settings\SettingsInterface;
use Omeka\Stdlib\Message;

class Module extends AbstractModule
Expand All @@ -23,6 +21,8 @@ public function getConfig()

public function install(ServiceLocatorInterface $services): void
{
$this->setServiceLocator($services);

$t = $services->get('MvcTranslator');

// Don't install if the pdftotext command doesn't exist.
Expand Down Expand Up @@ -59,7 +59,8 @@ public function install(ServiceLocatorInterface $services): void
foreach ($config as $name => $value) {
$settings->set($name, $value);
}
$this->allowXML($services->get('Omeka\Settings'));

$this->allowFileFormats();
}

public function uninstall(ServiceLocatorInterface $services): void
Expand All @@ -74,15 +75,27 @@ public function uninstall(ServiceLocatorInterface $services): void

/**
 * Apply the settings migrations required when the module is upgraded.
 *
 * Each version step below runs only when the previously installed version
 * is older than the step's version, then the TSV/XML file formats are
 * (re)added to the Omeka whitelists.
 *
 * @param string $oldVersion Version that was installed before the upgrade.
 * @param string $newVersion Version being installed (not used here).
 * @param ServiceLocatorInterface $services
 */
public function upgrade($oldVersion, $newVersion, ServiceLocatorInterface $services)
{
    // allowFileFormats() reads the settings through $this->getServiceLocator(),
    // so the locator is set on the module first.
    $this->setServiceLocator($services);

    // Fetched once and shared by every version step below.
    $plugins = $services->get('ControllerPluginManager');
    $settings = $services->get('Omeka\Settings');
    $messenger = $plugins->get('messenger');

    if (version_compare((string) $oldVersion, '3.4.5', '<')) {
        $message = new Message('A new option allows to create xml as alto multi-pages.'); // @translate
        // Default is alto on install, but pdf2xml during upgrade.
        $settings->set('extractocr_media_type', 'application/vnd.pdf2xml+xml');
        $messenger->addSuccess($message);
    }

    if (version_compare((string) $oldVersion, '3.4.6', '<')) {
        // The option was renamed: carry the old value over, then drop the old key.
        $settings->set('extractocr_create_empty_file', $settings->get('extractocr_create_empty_xml', false));
        $settings->delete('extractocr_create_empty_xml');
        $message = new Message('A new option allows to export OCR into tsv format for quicker search results. Data should be reindexed with format TSV.'); // @translate
        $messenger->addSuccess($message);
    }

    $this->allowFileFormats();
}

/**
Expand All @@ -105,31 +118,37 @@ public function attachListeners(SharedEventManagerInterface $sharedEventManager)
}

/**
 * Allow TSV and XML extensions and media types in omeka settings.
 *
 * The extensions and media types listed below are merged into the
 * "extension_whitelist" and "media_type_whitelist" settings, without
 * duplicating entries that are already present.
 */
protected function allowFileFormats(): void
{
    $settings = $this->getServiceLocator()->get('Omeka\Settings');

    // Both whitelists are read with an empty-list fallback and cast to array,
    // so array_merge() never receives null when a setting is missing.
    $extensionWhitelist = (array) $settings->get('extension_whitelist', []);
    $extensions = [
        'tsv',
        'xml',
    ];
    // array_unique() keeps the original keys; array_values() restores a plain list.
    $extensionWhitelist = array_values(array_unique(array_merge($extensionWhitelist, $extensions)));
    $settings->set('extension_whitelist', $extensionWhitelist);

    $mediaTypeWhitelist = (array) $settings->get('media_type_whitelist', []);
    $mediaTypes = [
        'application/xml',
        'text/xml',
        'application/alto+xml',
        'application/vnd.pdf2xml+xml',
        'application/x-empty',
        'text/tab-separated-values',
    ];
    $mediaTypeWhitelist = array_values(array_unique(array_merge($mediaTypeWhitelist, $mediaTypes)));
    $settings->set('media_type_whitelist', $mediaTypeWhitelist);
}

public function getConfigForm(PhpRenderer $renderer)
{
$this->allowFileFormats();

$services = $this->getServiceLocator();
$settings = $services->get('Omeka\Settings');
$form = $services->get('FormElementManager')->get(ConfigForm::class);
Expand Down Expand Up @@ -157,6 +176,7 @@ public function handleConfigForm(AbstractController $controller)
$services = $this->getServiceLocator();
$form = $services->get('FormElementManager')->get(ConfigForm::class);

/** @var \Laminas\Stdlib\Parameters $params */
$params = $controller->getRequest()->getPost();

$form->init();
Expand All @@ -166,26 +186,29 @@ public function handleConfigForm(AbstractController $controller)
return false;
}

$params = $form->getData();
$data = $form->getData();

$settings = $services->get('Omeka\Settings');
$settings->set('extractocr_media_type', $params['extractocr_media_type'] ?: 'application/alto+xml');
$settings->set('extractocr_content_store', $params['extractocr_content_store']);
$settings->set('extractocr_content_property', $params['extractocr_content_property']);
$settings->set('extractocr_content_language', $params['extractocr_content_language']);
$settings->set('extractocr_create_empty_xml', !empty($params['extractocr_create_empty_xml']));

// Form is already validated in parent.
$params = (array) $controller->getRequest()->getPost();
$params = array_intersect_key($params, ['override' => null, 'process' => null]);
$settings->set('extractocr_media_type', $data['extractocr_media_type'] ?: 'text/tab-separated-values');
$settings->set('extractocr_content_store', $data['extractocr_content_store']);
$settings->set('extractocr_content_property', $data['extractocr_content_property']);
$settings->set('extractocr_content_language', $data['extractocr_content_language']);
$settings->set('extractocr_create_empty_file', !empty($data['extractocr_create_empty_file']));

// Keep only values used in job.
$params = array_intersect_key($params->getArrayCopy(), [
'mode' => 'all',
'item_ids' => '',
'process' => null,
]);
if (empty($params['process']) || $params['process'] !== $controller->translate('Process')) {
$message = 'No job launched.'; // @translate
$controller->messenger()->addWarning($message);
return true;
}

$args = [];
$args['override'] = (bool) ($params['override'] ?? false);
$args['mode'] = $params['mode'] ?? 'all';
$args['baseUri'] = $this->getBaseUri();
$args['item_ids'] = $params['item_ids'] ?? '';

Expand All @@ -202,7 +225,9 @@ public function handleConfigForm(AbstractController $controller)
'</a>',
sprintf(
'<a href="%s">',
htmlspecialchars($controller->url()->fromRoute('admin/id', ['controller' => 'job', 'id' => $job->getId(), 'action' => 'log']))
class_exists('Log\Module')
? htmlspecialchars($controller->url()->fromRoute('admin/default', ['controller' => 'log'], ['query' => ['job_id' => $job->getId()]]))
: htmlspecialchars($controller->url()->fromRoute('admin/id', ['controller' => 'job', 'id' => $job->getId(), 'action' => 'log']))
)
);
$message->setEscapeHtml(false);
Expand All @@ -221,25 +246,34 @@ public function extractOcr(Event $event): void
/** @var \Omeka\Entity\Item $item */
$item = $response->getContent();

$settings = $this->getServiceLocator()->get('Omeka\Settings');
$targetMediaType = $settings->get('extractocr_media_type') ?? 'text/tab-separated-values';
$targetExtension = $targetMediaType === 'text/tab-separated-values' ? '.tsv' : '.xml';

$hasPdf = false;
$targetFilename = null;
/** @var \Omeka\Entity\Media $media */
foreach ($item->getMedia() as $media) {
if (strtolower((string) $media->getExtension()) === 'pdf'
&& $media->getMediaType() === 'application/pdf'
) {
$mediaType = $media->getMediaType();
$extension = strtolower((string) $media->getExtension());
if ($mediaType === 'application/pdf' && $extension === 'pdf') {
$hasPdf = true;
$source = (string) $media->getSource();
$filename = (string) parse_url($source, PHP_URL_PATH);
$targetFilename = strlen($filename)
? basename($filename, '.pdf')
: $media->id() . '-' . $media->getStorageId();
$targetFilename .= '.xml';
$targetFilename .= $targetExtension;
break;
}
}

if (!$hasPdf || $targetFilename === '.xml') {
if (!$hasPdf || $targetFilename === '.tsv' || $targetFilename === '.xml') {
return;
}

// Don't override an already processed pdf when updating an item.
if ($this->getMediaFromFilename($item->getId(), $targetFilename, 'tsv')) {
return;
}

Expand All @@ -249,7 +283,7 @@ public function extractOcr(Event $event): void
}

$params = [
'override' => false,
'mode' => 'missing',
'baseUri' => $this->getBaseUri(),
'itemId' => $item->getId(),
// FIXME Currently impossible to save text with event api.update.post;
Expand Down Expand Up @@ -325,7 +359,7 @@ protected function getBaseUri()
protected function checkDir($dirPath)
{
if (!file_exists($dirPath)) {
if (!is_writeable(basename($dirPath))) {
if (!is_writeable(dirname($dirPath))) {
return false;
}
@mkdir($dirPath, 0755, true);
Expand Down
25 changes: 16 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@ Extract OCR (plugin upgraded for Omeka S)
=========================================


Module for Omeka S to extract OCR text in XML from PDF files, allowing fulltext
searching within any IIIF viewer like Universal Viewer or Mirador with [IIIF-Search module](https://github.com/bubdxm/Omeka-S-module-IiifSearch)).
Module for Omeka S to extract OCR text in XML and TSV from PDF files, allowing
instant fulltext searching within any IIIF viewer like Universal Viewer or
Mirador with [IIIF-Search module](https://github.com/bubdxm/Omeka-S-module-IiifSearch).

The xml format is the simple [pdf2xml](https://poppler.freedesktop.org) or the
most common standard [alto](https://www.loc.gov/standards/alto).
most common standard [alto](https://www.loc.gov/standards/alto). The tsv format
is a simple two-column table with the words and the list of their positions by page.

The tsv format is recommended as it is a lot quicker, in particular for items
with many pages.


Installation
Expand Down Expand Up @@ -54,9 +59,9 @@ Using the Extract OCR module
- Create an item
- Save this Item
- After save, add PDF file(s) to this item
- To locate extracted OCR xml file, select the item to which the PDF is
attached. Normally, you should see an XML file attached to the record with the
same filename than the pdf file.
- To locate the extracted OCR xml or tsv file, select the item to which the PDF is
attached. Normally, you should see an XML or a tsv file attached to the record
with the same filename as the pdf file.


Optional modules
Expand All @@ -71,13 +76,15 @@ Optional modules
Module for Omeka S that includes UniversalViewer, a unified online player for
any file. It can display books, images, maps, audio, movies, pdf, 3D views,
and anything else as long as the appropriate extensions are installed.
Or any other IIIF viewers, like [Mirador](https://gitlab.com/Daniel-KM/Omeka-S-module-Mirador).
- [Mirador](https://gitlab.com/Daniel-KM/Omeka-S-module-Mirador)
- Or any other IIIF viewer, like [Diva](https://gitlab.com/Daniel-KM/Omeka-S-module-Diva).


TODO
----

- [ ] Extract strings with pdftotext with arg -tsv and store them in a file or in database for simpler and quicker search.
- [x] Extract strings with pdftotext with arg -tsv and store them in a file or in database for simpler and quicker search.
- [ ] Extract strings by word, but with one position by row, to allow searching with "AND", not only "OR".


Troubleshooting
Expand Down Expand Up @@ -110,4 +117,4 @@ Copyright
---------

* Copyright Sylvain Machefert, Université Bordeaux 3 (see [symac](https://github.com/symac))
* Copyright Daniel Berthereau, 2020-2023 (see [Daniel-KM](https://gitlab.com/Daniel-KM) on GitLab)
* Copyright Daniel Berthereau, 2020-2024 (see [Daniel-KM](https://gitlab.com/Daniel-KM) on GitLab)
10 changes: 5 additions & 5 deletions config/module.config.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@
],
'extractocr' => [
'config' => [
'extractocr_media_type' => 'application/alto+xml',
'extractocr_content_store' => [
'media_xml',
],
'extractocr_media_type' => 'text/tab-separated-values',
// Don't set a default option to avoid issue with config form.
'extractocr_content_store' => [],
'extractocr_content_property' => 'bibo:content',
'extractocr_content_language' => '',
'extractocr_create_empty_xml' => false,
// Create an empty file when a page does not have text.
'extractocr_create_empty_file' => false,
],
],
];
6 changes: 3 additions & 3 deletions config/module.ini
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
[info]
name = "Extract Ocr"
description = "Extract text layer from a pdf and attach it to item as a xml file, formatted with standard "alto" or simple "pdf2xml". The process uses the command line tool pdftohtml of poppler."
tags = "ocr, pdf, alto"
description = "Extract text layer from a pdf and attach it to item as a xml file, formatted with standard "alto" or simple "pdf2xml" or common "tsv". The process uses the command line tool pdftohtml of poppler."
tags = "ocr, pdf, alto, tsv"
license = "GPL-3.0-or-later"
author = "bubdxm, completed by Daniel Berthereau"
author_link = "https://github.com/symac"
module_link = "http://github.com/bubdxm/Omeka-S-module-ExtractOcr"
configurable = true
version = "3.4.5"
version = "3.4.6"
omeka_version_constraint = "^3.0.0 || ^4.0.0"
Binary file modified language/fr.mo
Binary file not shown.
Loading

0 comments on commit 713a959

Please sign in to comment.