From 75ad9f9266efc08a793d34cfb46106de4b92739f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 30 Nov 2020 21:33:43 +0100 Subject: [PATCH] support ocrmypdf#32 (#39) (#40) * First working version with OCRmyPDF #32 * Fix variable assignment * Use ProcessingFileAccessor to prevent infinite loop * Update README for OCRmyPDF * docs: update TOC * Update README + app compliance * Code compliance * Apply suggestions from code review Co-authored-by: Manuel Bentele Co-authored-by: R0Wi Co-authored-by: Manuel Bentele Co-authored-by: Robin Windey Co-authored-by: R0Wi Co-authored-by: Manuel Bentele --- README.md | 58 +- appinfo/info.xml | 2 +- composer.json | 5 +- composer.lock | 560 ++++++++---------- doc/diagramms/pdf.drawio | 1 - doc/diagramms/pdf.svg | 3 - lib/AppInfo/Application.php | 24 +- lib/BackgroundJobs/ProcessFileJob.php | 41 +- .../IProcessingFileAccessor.php} | 21 +- lib/Helper/ProcessingFileAccessor.php | 63 ++ lib/OcrProcessors/PdfOcrProcessor.php | 200 +------ lib/Operation.php | 32 +- ...eractOcrWrapper.php => CommandWrapper.php} | 53 +- lib/Wrapper/FpdiWrapper.php | 78 --- lib/Wrapper/ICommand.php | 80 +++ lib/Wrapper/IImagick.php | 43 -- lib/Wrapper/IPdfParser.php | 41 -- lib/Wrapper/ITesseractOcr.php | 37 -- lib/Wrapper/ImagickWrapper.php | 115 ---- lib/Wrapper/PdfParserWrapper.php | 46 -- lib/Wrapper/ViewWrapper.php | 2 +- tests/Integration/ViewWrapperTest.php | 89 +++ tests/Unit/AppInfo/ApplicationTest.php | 6 +- .../BackgroundJobs/ProcessFileJobTest.php | 67 ++- .../Helper/ProcessingFIleAccessorTest.php | 22 +- .../OcrProcessors/PdfOcrProcessorTest.php | 198 ++----- tests/Unit/OperationTest.php | 67 ++- .../Unit/Wrapper/CommandWrapperTest.php | 32 +- 28 files changed, 834 insertions(+), 1152 deletions(-) delete mode 100644 doc/diagramms/pdf.drawio delete mode 100644 doc/diagramms/pdf.svg rename lib/{Wrapper/WrapperFactory.php => Helper/IProcessingFileAccessor.php} (69%) create mode 
100644 lib/Helper/ProcessingFileAccessor.php rename lib/Wrapper/{TesseractOcrWrapper.php => CommandWrapper.php} (51%) delete mode 100644 lib/Wrapper/FpdiWrapper.php create mode 100644 lib/Wrapper/ICommand.php delete mode 100644 lib/Wrapper/IImagick.php delete mode 100644 lib/Wrapper/IPdfParser.php delete mode 100644 lib/Wrapper/ITesseractOcr.php delete mode 100644 lib/Wrapper/ImagickWrapper.php delete mode 100644 lib/Wrapper/PdfParserWrapper.php create mode 100644 tests/Integration/ViewWrapperTest.php rename lib/Wrapper/IWrapperFactory.php => tests/Unit/Helper/ProcessingFIleAccessorTest.php (59%) rename lib/Wrapper/IFpdi.php => tests/Unit/Wrapper/CommandWrapperTest.php (55%) diff --git a/README.md b/README.md index 3650e71..3cea73f 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,6 @@ - [App installation](#app-installation) - [Nextcloud background jobs](#nextcloud-background-jobs) - [Backend](#backend) - - [Imagick](#imagick) - - [Tesseract](#tesseract) - [Usage](#usage) - [How it works](#how-it-works) - [General](#general) @@ -44,46 +42,23 @@ Since the actual processing of the files is done asynchronously via Nextcloud's ### Backend -#### Imagick -Make sure `Imagick` is installed (the command below is for debian based Linux systems. It might be different on your system.). -```bash -sudo apt-get install php-imagick -``` +> :warning: Since `v1.20.1` you'll have to install `OCRmyPDF`. -Make sure `Imagick` is properly configured so that it can access pdf files. On debian based systems edit the configuration file `/etc/ImageMagick-6/policy.xml` (path might be different on your system). It has to contain at least this line: -```xml - - - - - - -``` -If you use **any other background job setting than [`cron`](https://docs.nextcloud.com/server/latest/admin_manual/configuration_server/background_jobs_configuration.html#cron)** you'll have to restart your php environment for the above changes to be applied. 
Depending on your system this is usually done by restarting your `php-fpm`-daemon or webserver, for example: +In the backend [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used for processing PDF files. Make sure you have this commandline tool installed. ```bash -# Restart php-fpm -sudo systemctl restart php7.3-fpm.service - -# Restart Apache webserver -sudo systemctl restart apache2 -``` - -You can find additional information about `Imagick` [here](https://www.php.net/manual/en/imagick.setup.php). +apt-get install ocrmypdf +``` -> :warning: **Note that `Imagick` requires [Ghostscript](https://www.ghostscript.com) to properly read PDF files. You can find more details in the section [Supported Image Formats](https://imagemagick.org/script/formats.php#supported) of `Imagick`'s documentation.** +Also if you want to use specific language settings please install the corresponding `tesseract` packages. -#### Tesseract -For the OCR part the commandlinetool `tesseract` is used. Make sure you have the library and appropriate languages installed. I recommend installing the packages from [PPA](https://github.com/tesseract-ocr/tessdoc/blob/master/Home.md) because they're newer than the official package-sources (i tested with `tesseract 4.1.1`). On Ubuntu 18.04 you might type the following for languages english and german: ```bash -# Install PPA -sudo add-apt-repository ppa:alex-p/tesseract-ocr -sudo apt-get update +# English +apt-get install tesseract-ocr-eng -# Install Tesseract and language-files -sudo apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng +# German +apt-get install tesseract-ocr-deu ``` -You can read more about the installation of `tesseract` [here](https://github.com/tesseract-ocr/tesseract/wiki). ## Usage You can configure the OCR processing via Nextcloud's workflow engine. 
Therefore configure a new flow via `Settings -> Flow -> Add new flow` (if you don't see `OCR file` here the app isn't installed properly or you forgot to activate it). @@ -113,11 +88,7 @@ To **test** if your file gets processed properly you can do the following steps:

### PDF -

- PDF diagramm -

- -**Note on PDF processing:** since the processing algorithm for PDF files makes heavy use of splitting an recombining the single PDF pages, it could damage certain PDF files or manipulate the content somehow. +For processing PDF files, the external command line tool [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used. The tool is invoked with the [`--redo-ocr`](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) parameter so that it will perform a detailed text analysis. The detailed analysis masks out visible text and sends the image of each page to the OCR processor. After processing, additional text is inserted as OCR, whereas existing text in a mixed file document (images embedded into text pages) is not disrupted. ## Development ### Dev setup @@ -232,11 +203,6 @@ That's all. If you now create a new workflow based on your added mimetype, your ## Used libraries & components | Name | Version | Link | |---|---|---| -| tesseract_ocr | >= 2.9 | https://github.com/thiagoalessio/tesseract-ocr-for-php | -| tesseract (commandline) | >= 4.0 | https://github.com/tesseract-ocr/tesseract | -| pdfparser | >= 0.15.0 | https://www.pdfparser.org/ | -| fpdi | >= 2.3 | https://www.setasign.com/products/fpdi/about/ | -| fpdf | >= 1.8 | http://www.fpdf.org/ | -| imagick php extension | >= 2 | https://www.php.net/manual/de/book.imagick.php | -| Ghostscript | >= 9.0 | https://www.ghostscript.com/ | +| OCRmyPDF (commandline) | >= 9.6.0 | https://github.com/jbarlow83/OCRmyPDF | +| php-shellcommand | >= 1.6 | https://github.com/mikehaertl/php-shellcommand | | PHPUnit | >= 8.0 | https://phpunit.de/ | diff --git a/appinfo/info.xml b/appinfo/info.xml index a91b54d..131f5d1 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -6,7 +6,7 @@ OCR processing via workflow This app makes it possible to process various files via OCR algorithms. The processing is done via workflow-engine and can therefore easily be customized. 
- 1.20.0 + 1.20.1 agpl Robin Windey WorkflowOcr diff --git a/composer.json b/composer.json index 34d527a..af7c221 100644 --- a/composer.json +++ b/composer.json @@ -1,9 +1,6 @@ { "require": { - "thiagoalessio/tesseract_ocr": "^2.9", - "smalot/pdfparser": "^0.15.0", - "setasign/fpdi": "^2.3", - "setasign/fpdf": "^1.8" + "mikehaertl/php-shellcommand": "^1.6" }, "require-dev": { "phpunit/phpunit": "^8.0", diff --git a/composer.lock b/composer.lock index 6840ce6..b073a11 100644 --- a/composer.lock +++ b/composer.lock @@ -1,250 +1,35 @@ { "_readme": [ "This file locks the dependencies of your project to a known state", - "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "51f9fbef2691c0fe41d0ad74f8376124", + "content-hash": "e55f55fa4c0a53d009aa03e19fb4655f", "packages": [ { - "name": "setasign/fpdf", - "version": "1.8.2", + "name": "mikehaertl/php-shellcommand", + "version": "1.6.2", "source": { "type": "git", - "url": "https://github.com/Setasign/FPDF.git", - "reference": "d77904018090c17dc9f3ab6e944679a7a47e710a" + "url": "https://github.com/mikehaertl/php-shellcommand.git", + "reference": "06d6220c77c4632b639f4855f76026c59bceb8aa" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/Setasign/FPDF/zipball/d77904018090c17dc9f3ab6e944679a7a47e710a", - "reference": "d77904018090c17dc9f3ab6e944679a7a47e710a", - "shasum": "" - }, - "type": "library", - "autoload": { - "classmap": [ - "fpdf.php" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Olivier Plathey", - "email": "oliver@fpdf.org", - "homepage": "http://fpdf.org/" - } - ], - "description": "FPDF is a PHP class which allows to generate PDF files with pure PHP. 
F from FPDF stands for Free: you may use it for any kind of usage and modify it to suit your needs.", - "homepage": "http://www.fpdf.org", - "keywords": [ - "fpdf", - "pdf" - ], - "time": "2019-12-08T10:32:10+00:00" - }, - { - "name": "setasign/fpdi", - "version": "v2.3.4", - "source": { - "type": "git", - "url": "https://github.com/Setasign/FPDI.git", - "reference": "2b5fb811c04f937ef257ef3f798cebeded33c136" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/Setasign/FPDI/zipball/2b5fb811c04f937ef257ef3f798cebeded33c136", - "reference": "2b5fb811c04f937ef257ef3f798cebeded33c136", - "shasum": "" - }, - "require": { - "ext-zlib": "*", - "php": "^5.6 || ^7.0" - }, - "conflict": { - "setasign/tfpdf": "<1.31" - }, - "require-dev": { - "phpunit/phpunit": "~5.7", - "setasign/fpdf": "~1.8", - "setasign/tfpdf": "1.31", - "squizlabs/php_codesniffer": "^3.5", - "tecnickcom/tcpdf": "~6.2" - }, - "suggest": { - "setasign/fpdf": "FPDI will extend this class but as it is also possible to use TCPDF or tFPDF as an alternative. There's no fixed dependency configured." - }, - "type": "library", - "autoload": { - "psr-4": { - "setasign\\Fpdi\\": "src/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Jan Slabon", - "email": "jan.slabon@setasign.com", - "homepage": "https://www.setasign.com" - }, - { - "name": "Maximilian Kresse", - "email": "maximilian.kresse@setasign.com", - "homepage": "https://www.setasign.com" - } - ], - "description": "FPDI is a collection of PHP classes facilitating developers to read pages from existing PDF documents and use them as templates in FPDF. Because it is also possible to use FPDI with TCPDF, there are no fixed dependencies defined. 
Please see suggestions for packages which evaluates the dependencies automatically.", - "homepage": "https://www.setasign.com/fpdi", - "keywords": [ - "fpdf", - "fpdi", - "pdf" - ], - "time": "2020-08-27T06:55:47+00:00" - }, - { - "name": "smalot/pdfparser", - "version": "v0.15.1", - "source": { - "type": "git", - "url": "https://github.com/smalot/pdfparser.git", - "reference": "6bc9dcbab5154f7d9f4c99e9cd3391f7ba019dc1" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/smalot/pdfparser/zipball/6bc9dcbab5154f7d9f4c99e9cd3391f7ba019dc1", - "reference": "6bc9dcbab5154f7d9f4c99e9cd3391f7ba019dc1", - "shasum": "" - }, - "require": { - "ext-mbstring": "*", - "ext-zlib": "*", - "php": "^5.6|^7.0", - "tecnickcom/tcpdf": "^6.2.22" - }, - "require-dev": { - "atoum/atoum": "^3.1", - "friendsofphp/php-cs-fixer": "^2.16.3" - }, - "type": "library", - "autoload": { - "psr-0": { - "Smalot\\PdfParser\\": "src/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "LGPL-3.0" - ], - "authors": [ - { - "name": "Sebastien MALOT", - "email": "sebastien@malot.fr" - } - ], - "description": "Pdf parser library. 
Can read and extract information from pdf file.", - "homepage": "http://www.pdfparser.org", - "keywords": [ - "extract", - "parse", - "parser", - "pdf", - "text" - ], - "time": "2020-05-27T07:55:41+00:00" - }, - { - "name": "tecnickcom/tcpdf", - "version": "6.3.5", - "source": { - "type": "git", - "url": "https://github.com/tecnickcom/TCPDF.git", - "reference": "19a535eaa7fb1c1cac499109deeb1a7a201b4549" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/tecnickcom/TCPDF/zipball/19a535eaa7fb1c1cac499109deeb1a7a201b4549", - "reference": "19a535eaa7fb1c1cac499109deeb1a7a201b4549", - "shasum": "" - }, - "require": { - "php": ">=5.3.0" - }, - "type": "library", - "autoload": { - "classmap": [ - "config", - "include", - "tcpdf.php", - "tcpdf_parser.php", - "tcpdf_import.php", - "tcpdf_barcodes_1d.php", - "tcpdf_barcodes_2d.php", - "include/tcpdf_colors.php", - "include/tcpdf_filters.php", - "include/tcpdf_font_data.php", - "include/tcpdf_fonts.php", - "include/tcpdf_images.php", - "include/tcpdf_static.php", - "include/barcodes/datamatrix.php", - "include/barcodes/pdf417.php", - "include/barcodes/qrcode.php" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "LGPL-3.0-only" - ], - "authors": [ - { - "name": "Nicola Asuni", - "email": "info@tecnick.com", - "role": "lead" - } - ], - "description": "TCPDF is a PHP class for generating PDF documents and barcodes.", - "homepage": "http://www.tcpdf.org/", - "keywords": [ - "PDFD32000-2008", - "TCPDF", - "barcodes", - "datamatrix", - "pdf", - "pdf417", - "qrcode" - ], - "time": "2020-02-14T14:20:12+00:00" - }, - { - "name": "thiagoalessio/tesseract_ocr", - "version": "2.9.3", - "source": { - "type": "git", - "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", - "reference": "e932f7410e753434b26b214f3f322933efafa0f0" - }, - "dist": { - "type": "zip", - "url": 
"https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/e932f7410e753434b26b214f3f322933efafa0f0", - "reference": "e932f7410e753434b26b214f3f322933efafa0f0", + "url": "https://api.github.com/repos/mikehaertl/php-shellcommand/zipball/06d6220c77c4632b639f4855f76026c59bceb8aa", + "reference": "06d6220c77c4632b639f4855f76026c59bceb8aa", "shasum": "" }, "require": { - "php": "^5.4 || ^7.0" + "php": ">= 5.4.0" }, "require-dev": { - "codacy/coverage": "dev-master", - "phpunit/php-code-coverage": "^2.2.4" + "phpunit/phpunit": ">4.0 <8" }, "type": "library", "autoload": { "psr-4": { - "thiagoalessio\\TesseractOCR\\": "src/" + "mikehaertl\\shellcommand\\": "src/" } }, "notification-url": "https://packagist.org/downloads/", @@ -253,32 +38,30 @@ ], "authors": [ { - "name": "thiagoalessio", - "email": "thiagoalessio@me.com" + "name": "Michael Härtl", + "email": "haertl.mike@gmail.com" } ], - "description": "A wrapper to work with Tesseract OCR inside PHP.", + "description": "An object oriented interface to shell commands", "keywords": [ - "OCR", - "Tesseract", - "text recognition" + "shell" ], - "time": "2020-01-27T19:53:35+00:00" + "time": "2020-08-30T09:56:40+00:00" } ], "packages-dev": [ { "name": "composer/semver", - "version": "1.5.1", + "version": "1.7.1", "source": { "type": "git", "url": "https://github.com/composer/semver.git", - "reference": "c6bea70230ef4dd483e6bbcab6005f682ed3a8de" + "reference": "38276325bd896f90dfcfe30029aa5db40df387a7" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/composer/semver/zipball/c6bea70230ef4dd483e6bbcab6005f682ed3a8de", - "reference": "c6bea70230ef4dd483e6bbcab6005f682ed3a8de", + "url": "https://api.github.com/repos/composer/semver/zipball/38276325bd896f90dfcfe30029aa5db40df387a7", + "reference": "38276325bd896f90dfcfe30029aa5db40df387a7", "shasum": "" }, "require": { @@ -326,7 +109,21 @@ "validation", "versioning" ], - "time": "2020-01-13T12:06:48+00:00" + "funding": [ + { + "url": 
"https://packagist.com", + "type": "custom" + }, + { + "url": "https://github.com/composer", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/composer/composer", + "type": "tidelift" + } + ], + "time": "2020-09-27T13:13:07+00:00" }, { "name": "composer/xdebug-handler", @@ -738,20 +535,20 @@ }, { "name": "paragonie/random_compat", - "version": "v9.99.99", + "version": "v9.99.100", "source": { "type": "git", "url": "https://github.com/paragonie/random_compat.git", - "reference": "84b4dfb120c6f9b4ff7b3685f9b8f1aa365a0c95" + "reference": "996434e5492cb4c3edcb9168db6fbb1359ef965a" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/paragonie/random_compat/zipball/84b4dfb120c6f9b4ff7b3685f9b8f1aa365a0c95", - "reference": "84b4dfb120c6f9b4ff7b3685f9b8f1aa365a0c95", + "url": "https://api.github.com/repos/paragonie/random_compat/zipball/996434e5492cb4c3edcb9168db6fbb1359ef965a", + "reference": "996434e5492cb4c3edcb9168db6fbb1359ef965a", "shasum": "" }, "require": { - "php": "^7" + "php": ">= 7" }, "require-dev": { "phpunit/phpunit": "4.*|5.*", @@ -779,7 +576,7 @@ "pseudorandom", "random" ], - "time": "2018-07-02T15:55:56+00:00" + "time": "2020-10-15T08:29:30+00:00" }, { "name": "phar-io/manifest", @@ -885,23 +682,23 @@ }, { "name": "php-cs-fixer/diff", - "version": "v1.3.0", + "version": "v1.3.1", "source": { "type": "git", "url": "https://github.com/PHP-CS-Fixer/diff.git", - "reference": "78bb099e9c16361126c86ce82ec4405ebab8e756" + "reference": "dbd31aeb251639ac0b9e7e29405c1441907f5759" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/PHP-CS-Fixer/diff/zipball/78bb099e9c16361126c86ce82ec4405ebab8e756", - "reference": "78bb099e9c16361126c86ce82ec4405ebab8e756", + "url": "https://api.github.com/repos/PHP-CS-Fixer/diff/zipball/dbd31aeb251639ac0b9e7e29405c1441907f5759", + "reference": "dbd31aeb251639ac0b9e7e29405c1441907f5759", "shasum": "" }, "require": { - "php": "^5.6 || ^7.0" + "php": "^5.6 || ^7.0 || 
^8.0" }, "require-dev": { - "phpunit/phpunit": "^5.7.23 || ^6.4.3", + "phpunit/phpunit": "^5.7.23 || ^6.4.3 || ^7.0", "symfony/process": "^3.3" }, "type": "library", @@ -915,14 +712,14 @@ "BSD-3-Clause" ], "authors": [ - { - "name": "Kore Nordmann", - "email": "mail@kore-nordmann.de" - }, { "name": "Sebastian Bergmann", "email": "sebastian@phpunit.de" }, + { + "name": "Kore Nordmann", + "email": "mail@kore-nordmann.de" + }, { "name": "SpacePossum" } @@ -932,7 +729,7 @@ "keywords": [ "diff" ], - "time": "2018-02-15T16:58:55+00:00" + "time": "2020-10-14T08:39:05+00:00" }, { "name": "phpdocumentor/reflection-common", @@ -985,16 +782,16 @@ }, { "name": "phpdocumentor/reflection-docblock", - "version": "5.2.1", + "version": "5.2.2", "source": { "type": "git", "url": "https://github.com/phpDocumentor/ReflectionDocBlock.git", - "reference": "d870572532cd70bc3fab58f2e23ad423c8404c44" + "reference": "069a785b2141f5bcf49f3e353548dc1cce6df556" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/d870572532cd70bc3fab58f2e23ad423c8404c44", - "reference": "d870572532cd70bc3fab58f2e23ad423c8404c44", + "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/069a785b2141f5bcf49f3e353548dc1cce6df556", + "reference": "069a785b2141f5bcf49f3e353548dc1cce6df556", "shasum": "" }, "require": { @@ -1033,20 +830,20 @@ } ], "description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.", - "time": "2020-08-15T11:14:08+00:00" + "time": "2020-09-03T19:13:55+00:00" }, { "name": "phpdocumentor/type-resolver", - "version": "1.3.0", + "version": "1.4.0", "source": { "type": "git", "url": "https://github.com/phpDocumentor/TypeResolver.git", - "reference": "e878a14a65245fbe78f8080eba03b47c3b705651" + "reference": "6a467b8989322d92aa1c8bf2bebcc6e5c2ba55c0" }, "dist": { "type": "zip", - "url": 
"https://api.github.com/repos/phpDocumentor/TypeResolver/zipball/e878a14a65245fbe78f8080eba03b47c3b705651", - "reference": "e878a14a65245fbe78f8080eba03b47c3b705651", + "url": "https://api.github.com/repos/phpDocumentor/TypeResolver/zipball/6a467b8989322d92aa1c8bf2bebcc6e5c2ba55c0", + "reference": "6a467b8989322d92aa1c8bf2bebcc6e5c2ba55c0", "shasum": "" }, "require": { @@ -1078,32 +875,32 @@ } ], "description": "A PSR-5 based resolver of Class names, Types and Structural Element Names", - "time": "2020-06-27T10:12:23+00:00" + "time": "2020-09-17T18:55:26+00:00" }, { "name": "phpspec/prophecy", - "version": "1.11.1", + "version": "1.12.1", "source": { "type": "git", "url": "https://github.com/phpspec/prophecy.git", - "reference": "b20034be5efcdab4fb60ca3a29cba2949aead160" + "reference": "8ce87516be71aae9b956f81906aaf0338e0d8a2d" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/phpspec/prophecy/zipball/b20034be5efcdab4fb60ca3a29cba2949aead160", - "reference": "b20034be5efcdab4fb60ca3a29cba2949aead160", + "url": "https://api.github.com/repos/phpspec/prophecy/zipball/8ce87516be71aae9b956f81906aaf0338e0d8a2d", + "reference": "8ce87516be71aae9b956f81906aaf0338e0d8a2d", "shasum": "" }, "require": { "doctrine/instantiator": "^1.2", - "php": "^7.2", - "phpdocumentor/reflection-docblock": "^5.0", + "php": "^7.2 || ~8.0, <8.1", + "phpdocumentor/reflection-docblock": "^5.2", "sebastian/comparator": "^3.0 || ^4.0", "sebastian/recursion-context": "^3.0 || ^4.0" }, "require-dev": { "phpspec/phpspec": "^6.0", - "phpunit/phpunit": "^8.0" + "phpunit/phpunit": "^8.0 || ^9.0 <9.3" }, "type": "library", "extra": { @@ -1141,7 +938,7 @@ "spy", "stub" ], - "time": "2020-07-08T12:44:21+00:00" + "time": "2020-09-29T09:10:42+00:00" }, { "name": "phpunit/php-code-coverage", @@ -1393,6 +1190,7 @@ "keywords": [ "tokenizer" ], + "abandoned": true, "time": "2019-09-17T06:23:10+00:00" }, { @@ -2287,16 +2085,16 @@ }, { "name": "symfony/console", - "version": "v4.4.11", + 
"version": "v4.4.15", "source": { "type": "git", "url": "https://github.com/symfony/console.git", - "reference": "55d07021da933dd0d633ffdab6f45d5b230c7e02" + "reference": "90933b39c7b312fc3ceaa1ddeac7eb48cb953124" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/console/zipball/55d07021da933dd0d633ffdab6f45d5b230c7e02", - "reference": "55d07021da933dd0d633ffdab6f45d5b230c7e02", + "url": "https://api.github.com/repos/symfony/console/zipball/90933b39c7b312fc3ceaa1ddeac7eb48cb953124", + "reference": "90933b39c7b312fc3ceaa1ddeac7eb48cb953124", "shasum": "" }, "require": { @@ -2360,20 +2158,34 @@ ], "description": "Symfony Console Component", "homepage": "https://symfony.com", - "time": "2020-07-06T13:18:39+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-15T07:58:55+00:00" }, { "name": "symfony/deprecation-contracts", - "version": "v2.1.3", + "version": "v2.2.0", "source": { "type": "git", "url": "https://github.com/symfony/deprecation-contracts.git", - "reference": "5e20b83385a77593259c9f8beb2c43cd03b2ac14" + "reference": "5fa56b4074d1ae755beb55617ddafe6f5d78f665" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/deprecation-contracts/zipball/5e20b83385a77593259c9f8beb2c43cd03b2ac14", - "reference": "5e20b83385a77593259c9f8beb2c43cd03b2ac14", + "url": "https://api.github.com/repos/symfony/deprecation-contracts/zipball/5fa56b4074d1ae755beb55617ddafe6f5d78f665", + "reference": "5fa56b4074d1ae755beb55617ddafe6f5d78f665", "shasum": "" }, "require": { @@ -2382,7 +2194,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "2.1-dev" + "dev-master": "2.2-dev" }, "thanks": { "name": "symfony/contracts", @@ -2410,20 +2222,34 @@ ], "description": "A generic function and convention to 
trigger deprecation notices", "homepage": "https://symfony.com", - "time": "2020-06-06T08:49:21+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-07T11:33:47+00:00" }, { "name": "symfony/event-dispatcher", - "version": "v4.4.11", + "version": "v4.4.15", "source": { "type": "git", "url": "https://github.com/symfony/event-dispatcher.git", - "reference": "6140fc7047dafc5abbe84ba16a34a86c0b0229b8" + "reference": "e17bb5e0663dc725f7cdcafc932132735b4725cd" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/event-dispatcher/zipball/6140fc7047dafc5abbe84ba16a34a86c0b0229b8", - "reference": "6140fc7047dafc5abbe84ba16a34a86c0b0229b8", + "url": "https://api.github.com/repos/symfony/event-dispatcher/zipball/e17bb5e0663dc725f7cdcafc932132735b4725cd", + "reference": "e17bb5e0663dc725f7cdcafc932132735b4725cd", "shasum": "" }, "require": { @@ -2441,6 +2267,7 @@ "psr/log": "~1.0", "symfony/config": "^3.4|^4.0|^5.0", "symfony/dependency-injection": "^3.4|^4.0|^5.0", + "symfony/error-handler": "~3.4|~4.4", "symfony/expression-language": "^3.4|^4.0|^5.0", "symfony/http-foundation": "^3.4|^4.0|^5.0", "symfony/service-contracts": "^1.1|^2", @@ -2480,7 +2307,21 @@ ], "description": "Symfony EventDispatcher Component", "homepage": "https://symfony.com", - "time": "2020-06-18T17:59:13+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-18T14:07:46+00:00" }, { "name": "symfony/event-dispatcher-contracts", @@ -2546,16 +2387,16 @@ }, { "name": "symfony/filesystem", - "version": "v5.1.3", + "version": "v5.1.7", 
"source": { "type": "git", "url": "https://github.com/symfony/filesystem.git", - "reference": "6e4320f06d5f2cce0d96530162491f4465179157" + "reference": "1a8697545a8d87b9f2f6b1d32414199cc5e20aae" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/filesystem/zipball/6e4320f06d5f2cce0d96530162491f4465179157", - "reference": "6e4320f06d5f2cce0d96530162491f4465179157", + "url": "https://api.github.com/repos/symfony/filesystem/zipball/1a8697545a8d87b9f2f6b1d32414199cc5e20aae", + "reference": "1a8697545a8d87b9f2f6b1d32414199cc5e20aae", "shasum": "" }, "require": { @@ -2592,20 +2433,34 @@ ], "description": "Symfony Filesystem Component", "homepage": "https://symfony.com", - "time": "2020-05-30T20:35:19+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-27T14:02:37+00:00" }, { "name": "symfony/finder", - "version": "v5.1.3", + "version": "v5.1.7", "source": { "type": "git", "url": "https://github.com/symfony/finder.git", - "reference": "4298870062bfc667cb78d2b379be4bf5dec5f187" + "reference": "2c3ba7ad6884e6c4451ce2340e2dc23f6fa3e0d8" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/finder/zipball/4298870062bfc667cb78d2b379be4bf5dec5f187", - "reference": "4298870062bfc667cb78d2b379be4bf5dec5f187", + "url": "https://api.github.com/repos/symfony/finder/zipball/2c3ba7ad6884e6c4451ce2340e2dc23f6fa3e0d8", + "reference": "2c3ba7ad6884e6c4451ce2340e2dc23f6fa3e0d8", "shasum": "" }, "require": { @@ -2641,20 +2496,34 @@ ], "description": "Symfony Finder Component", "homepage": "https://symfony.com", - "time": "2020-05-20T17:43:50+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": 
"https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-02T16:23:27+00:00" }, { "name": "symfony/options-resolver", - "version": "v5.1.3", + "version": "v5.1.7", "source": { "type": "git", "url": "https://github.com/symfony/options-resolver.git", - "reference": "9ff59517938f88d90b6e65311fef08faa640f681" + "reference": "4c7e155bf7d93ea4ba3824d5a14476694a5278dd" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/options-resolver/zipball/9ff59517938f88d90b6e65311fef08faa640f681", - "reference": "9ff59517938f88d90b6e65311fef08faa640f681", + "url": "https://api.github.com/repos/symfony/options-resolver/zipball/4c7e155bf7d93ea4ba3824d5a14476694a5278dd", + "reference": "4c7e155bf7d93ea4ba3824d5a14476694a5278dd", "shasum": "" }, "require": { @@ -2697,7 +2566,21 @@ "configuration", "options" ], - "time": "2020-07-12T12:58:00+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-27T03:44:28+00:00" }, { "name": "symfony/polyfill-ctype", @@ -3076,16 +2959,16 @@ }, { "name": "symfony/process", - "version": "v5.1.3", + "version": "v5.1.7", "source": { "type": "git", "url": "https://github.com/symfony/process.git", - "reference": "1864216226af21eb76d9477f691e7cbf198e0402" + "reference": "d3a2e64866169586502f0cd9cab69135ad12cee9" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/process/zipball/1864216226af21eb76d9477f691e7cbf198e0402", - "reference": "1864216226af21eb76d9477f691e7cbf198e0402", + "url": "https://api.github.com/repos/symfony/process/zipball/d3a2e64866169586502f0cd9cab69135ad12cee9", + "reference": "d3a2e64866169586502f0cd9cab69135ad12cee9", "shasum": "" }, "require": { @@ -3122,20 +3005,34 @@ ], "description": "Symfony Process 
Component", "homepage": "https://symfony.com", - "time": "2020-07-23T08:36:24+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-02T16:23:27+00:00" }, { "name": "symfony/service-contracts", - "version": "v2.1.3", + "version": "v2.2.0", "source": { "type": "git", "url": "https://github.com/symfony/service-contracts.git", - "reference": "58c7475e5457c5492c26cc740cc0ad7464be9442" + "reference": "d15da7ba4957ffb8f1747218be9e1a121fd298a1" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/service-contracts/zipball/58c7475e5457c5492c26cc740cc0ad7464be9442", - "reference": "58c7475e5457c5492c26cc740cc0ad7464be9442", + "url": "https://api.github.com/repos/symfony/service-contracts/zipball/d15da7ba4957ffb8f1747218be9e1a121fd298a1", + "reference": "d15da7ba4957ffb8f1747218be9e1a121fd298a1", "shasum": "" }, "require": { @@ -3148,7 +3045,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "2.1-dev" + "dev-master": "2.2-dev" }, "thanks": { "name": "symfony/contracts", @@ -3184,11 +3081,25 @@ "interoperability", "standards" ], - "time": "2020-07-06T13:23:11+00:00" + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2020-09-07T11:33:47+00:00" }, { "name": "symfony/stopwatch", - "version": "v5.1.3", + "version": "v5.1.7", "source": { "type": "git", "url": "https://github.com/symfony/stopwatch.git", @@ -3234,6 +3145,20 @@ ], "description": "Symfony Stopwatch Component", "homepage": "https://symfony.com", + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": 
"https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], "time": "2020-05-20T17:43:50+00:00" }, { @@ -3372,5 +3297,6 @@ "prefer-stable": false, "prefer-lowest": false, "platform": [], - "platform-dev": [] + "platform-dev": [], + "plugin-api-version": "1.1.0" } diff --git a/doc/diagramms/pdf.drawio b/doc/diagramms/pdf.drawio deleted file mode 100644 index 0d6623e..0000000 --- a/doc/diagramms/pdf.drawio +++ /dev/null @@ -1 +0,0 @@ -7Vtbc9o4FP41zD6F8QUbeGwgabdt0nST3bZPGQULo1a2WFkOOL9+JVu2MVIMSTAGtkxmgo7l23fOd24SHXsULN9TMJ9dEQ/ijmV4y4497liWadsO/yckSSYZWHYm8Cny5KRScIueoBQaUhojD0aViYwQzNC8KpyQMIQTVpEBSsmiOm1KcPWuc+BDRXA7AViVfkMem0mpYxjlgQ8Q+TN5a8cy8kMByKdLQTQDHlmsiOyLjj2ihLDsW7AcQSzgy5H59mfyDX/+5b7/+DX6F/x9/unu+p+z7GKXLzmleAkKQ/b6Sy9NOH9iw59fyVncWwb4xvx41pevxpIcMehxAOWQUDYjPgkBviil55TEoQfFVQ0+Kud8JmTOhSYX/oSMJdIaQMwIF81YgOXR7J7iRmtK2vB+cl5EYjqBNfOkiTJAfchqXt4tlMjtH5IAMprw8yjEgKHH6sMBaYh+Ma+Emn+RaL9AqfIhHwGO5Z1uKJnAKOLCm/GlopcSdQHhYoYYvJ2DFIcF524V4SkJmYTf5K947mMQRVJhEaPkV8EGMbswbKNQzra6eISUweUKUiqc8uhAsijJHYQcL0paDqVotkLInvF2/N0PyYgs7+zb5adr3zKeLkbW1Zlpt2H6HFSafJfnp4MfYtB18uF4uXpwnOSjJWLfy5l89GPlSHmSGCSv0uRGVrkqq7TA2lp6v4Fm6anvKAXJyoQ5QSGLVq58IwSlubl21d56zprFZBcs7ad4sjeYVO/4Tco8RJPqHaRJWYaxB5tylTgREq2ZfQYPPHmrmAbAyA/59wlXF6RcINw14tnRO3kgQJ6XWSGM0BN4SK8nFC2x4Bd3zjvOWBNSVFOo5YQSKIqcT961klTpAojRtfknO/VlilY0aVpdt2+sfKyKYs/MtaBDptOIG+h6yHmZeusYsKLdS0KFqYPJrEhv33UENC7mOJ8/8KOuL755BEb5FP6omELgJdqZHGgGUCiYBpdcoEkuZiR4iKPNiUUlVRAmcQkChAVuHyB+hMK4NLai2KEmB+G3RKHPR245uktdIKd/c2mJ41ZJ7appidnT5CWm1VhiYrUSRWQ0MFdiQRkZ9NHgtZHn9VFEC5ihSfdruLa7KPImNRsK82/nGAknxV0VEe/NCQAlwaOTKwd6xuZ6oODYfgqCVmrhffIuHd1Aijhgwg0XTrw+rr+JnJpaXDvPOShyqsX5mIR/sLR7FZeROc1hmADq1OjZt9ZyXadtdg5+s3P37Oxtyc7+QbEzLylKdn4Z/VVw8sSZaA7aDpRFP/zwqcgHe2aUsyWjhgfFKEdhlKLivHWQthluSIQYIqKmeyCMkaCqq/U2AxM6fq4KHBFMhHJCEsKMVHNxy2DpiyWpLniKKexOMIm9+wjSRzSB0T0Pu1PkxxSIh7ifotTE0p4FpBePMGtdpIRGGOc36Fh2Vu03yNbBcNDtO9XQqfK11+86KmGdpg
irNhm+UOQjzr7nnGbWJ2iqlSTIHUkKNqmLfgGzVEVfzWJ0aw7NlfbDY/acZWJT5jJFy7gmsWnY4/aP0uP2f3vcxjyuqeH5nl3uYLPL1TRqOxb/M7JkNnXBloFBwjX3P/HP2jLT0Hhou7Hc1lSwPlQPfURl5nBLF52vuxyIjx4qJL6CVK625Dw+vc7s0G2xM6vdp9POTo2dZ0074KmWHjskqqlbS9GpxGqTl6a6dvI7eXoV103TMrtWNQgPW0+eTFPR7zVcdOS+OMMjkzjQNd1PIisyjcFaNtvbsvfeXNmqtocSTeA7CfjdgVlNSVXw7Yaw1zraVvLRHYYZ7Uttu2Jv7ynK1D1kXZCpZmhWZ32rTCUyTDFZTGaAsm7EX5zdW89kh89Y7Lbw1ziWtbSucOqbbNtpCGGrJYTro2wDyFv5LrjDwV5dQztN7G1l33m7uB/NMnbTMWDbvb/PbArdTwxody/eSzb7v7Y534Ka9Ui3qmd1hWyc7a7NKo5iA63asHVBINxa+BClNWPasoUgEv9FgVjT7F0gbhhxuqfoyHfl7t5rW2u7A3W7ch2N297Jrly9fTrH4rgPteFUQ/st/EOv1TCg1gLr+wEXM4Jh4TDSFo9C5702hhsgpbm2WqPJYXfVGObD8oe12Y8tyh8o2xf/AQ== \ No newline at end of file diff --git a/doc/diagramms/pdf.svg b/doc/diagramms/pdf.svg deleted file mode 100644 index 3ebc3d6..0000000 --- a/doc/diagramms/pdf.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
Process PDF
Process PDF
no
no
For each page:
does page already
contain text?
For each page:...
Split into singe pages
Split into singe pag...
Don't touch page content
Don't touch page con...
OCR page
OCR page
Original page
Original page
Original page
+ OCR text layer
Original page...
Merge all pages
Merge all pages
New PDF document
New PDF document
yes
yes
Does PDF contain
 at least one page
without text?
Does PDF contain...
Don't touch whole PDF file
Don't touch whole PD...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index 45aa9bd..8f28564 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -27,25 +27,19 @@ namespace OCA\WorkflowOcr\AppInfo; +use OCA\WorkflowOcr\Helper\IProcessingFileAccessor; +use OCA\WorkflowOcr\Helper\ProcessingFileAccessor; use OCA\WorkflowOcr\Listener\RegisterFlowOperationsListener; use OCA\WorkflowOcr\OcrProcessors\IOcrProcessorFactory; use OCA\WorkflowOcr\OcrProcessors\OcrProcessorFactory; use OCA\WorkflowOcr\Service\IOcrService; use OCA\WorkflowOcr\Service\OcrService; +use OCA\WorkflowOcr\Wrapper\CommandWrapper; use OCA\WorkflowOcr\Wrapper\Filesystem; -use OCA\WorkflowOcr\Wrapper\FpdiWrapper; +use OCA\WorkflowOcr\Wrapper\ICommand; use OCA\WorkflowOcr\Wrapper\IFilesystem; -use OCA\WorkflowOcr\Wrapper\IFpdi; -use OCA\WorkflowOcr\Wrapper\IImagick; -use OCA\WorkflowOcr\Wrapper\ImagickWrapper; -use OCA\WorkflowOcr\Wrapper\IPdfParser; -use OCA\WorkflowOcr\Wrapper\ITesseractOcr; use OCA\WorkflowOcr\Wrapper\IViewFactory; -use OCA\WorkflowOcr\Wrapper\IWrapperFactory; -use OCA\WorkflowOcr\Wrapper\PdfParserWrapper; -use OCA\WorkflowOcr\Wrapper\TesseractOcrWrapper; use OCA\WorkflowOcr\Wrapper\ViewFactory; -use OCA\WorkflowOcr\Wrapper\WrapperFactory; use OCP\AppFramework\App; use OCP\AppFramework\Bootstrap\IBootContext; use OCP\AppFramework\Bootstrap\IBootstrap; @@ -69,13 +63,13 @@ public function __construct(array $urlParams = []) { public function register(IRegistrationContext $context): void { $context->registerServiceAlias(IOcrService::class, OcrService::class); $context->registerServiceAlias(IOcrProcessorFactory::class, OcrProcessorFactory::class); - $context->registerServiceAlias(IPdfParser::class, PdfParserWrapper::class); - $context->registerServiceAlias(IImagick::class, ImagickWrapper::class); - $context->registerServiceAlias(ITesseractOcr::class, TesseractOcrWrapper::class); $context->registerServiceAlias(IViewFactory::class, 
ViewFactory::class); - $context->registerServiceAlias(IFpdi::class, FpdiWrapper::class); - $context->registerServiceAlias(IWrapperFactory::class, WrapperFactory::class); $context->registerServiceAlias(IFilesystem::class, Filesystem::class); + $context->registerServiceAlias(ICommand::class, CommandWrapper::class); + + $context->registerService(IProcessingFileAccessor::class, function () { + return ProcessingFileAccessor::getInstance(); + }); $context->registerEventListener(RegisterOperationsEvent::class, RegisterFlowOperationsListener::class); } diff --git a/lib/BackgroundJobs/ProcessFileJob.php b/lib/BackgroundJobs/ProcessFileJob.php index 41694a9..648ca5a 100644 --- a/lib/BackgroundJobs/ProcessFileJob.php +++ b/lib/BackgroundJobs/ProcessFileJob.php @@ -32,6 +32,7 @@ use \OCP\Files\File; use OCA\WorkflowOcr\Exception\OcrNotPossibleException; use OCA\WorkflowOcr\Exception\OcrProcessorNotFoundException; +use OCA\WorkflowOcr\Helper\IProcessingFileAccessor; use OCA\WorkflowOcr\Service\IOcrService; use OCA\WorkflowOcr\Wrapper\IFilesystem; use OCA\WorkflowOcr\Wrapper\IViewFactory; @@ -62,6 +63,8 @@ class ProcessFileJob extends \OC\BackgroundJob\QueuedJob { private $userManager; /** @var IUserSession */ private $userSession; + /** @var IProcessingFileAccessor */ + private $processingFileAccessor; public function __construct( LoggerInterface $logger, @@ -70,7 +73,8 @@ public function __construct( IViewFactory $viewFactory, IFilesystem $filesystem, IUserManager $userManager, - IUserSession $userSession) { + IUserSession $userSession, + IProcessingFileAccessor $processingFileAccessor) { $this->logger = $logger; $this->rootFolder = $rootFolder; $this->ocrService = $ocrService; @@ -78,6 +82,7 @@ public function __construct( $this->filesystem = $filesystem; $this->userManager = $userManager; $this->userSession = $userSession; + $this->processingFileAccessor = $processingFileAccessor; } /** @@ -153,6 +158,7 @@ private function processFile(string $filePath) : void { 
$this->logger->info('Skipping process for \'' . $filePath . '\'. It is not a file'); return; } + try { $ocrFile = $this->ocrFile($node); } catch (OcrNotPossibleException $ocrNpEx) { @@ -163,12 +169,7 @@ private function processFile(string $filePath) : void { return; } - $dirPath = dirname($filePath); - $filename = basename($filePath); - - // Create new file or file-version with OCR-file - $view = $this->viewFactory->create($dirPath); - $view->file_put_contents($filename, $ocrFile); + $this->createNewFileVersion($filePath, $ocrFile, $node->getId()); } /** @@ -192,10 +193,30 @@ private function ocrFile(File $file) : string { return $this->ocrService->ocrFile($file->getMimeType(), $file->getContent()); } - /** - * @param string $uid - */ private function shutdownUserEnvironment() : void { $this->userSession->setUser(null); } + + /** + * @param string $filePath The filepath of the file to write + * @param string $ocrContent The new filecontent (which was OCR processed) + * @param int $fileId The id of the file to write. Used for locking. + */ + private function createNewFileVersion(string $filePath, string $ocrContent, int $fileId) : void { + $dirPath = dirname($filePath); + $filename = basename($filePath); + + $this->processingFileAccessor->setCurrentlyProcessedFileId($fileId); + + try { + $view = $this->viewFactory->create($dirPath); + // Create new file or file-version with OCR-file + // This will trigger 'postWrite' event which would normally + // add the file to the queue again but this is tackled + // by the processingFileAccessor. 
+ $view->file_put_contents($filename, $ocrContent); + } finally { + $this->processingFileAccessor->setCurrentlyProcessedFileId(null); + } + } } diff --git a/lib/Wrapper/WrapperFactory.php b/lib/Helper/IProcessingFileAccessor.php similarity index 69% rename from lib/Wrapper/WrapperFactory.php rename to lib/Helper/IProcessingFileAccessor.php index 324d758..251a3a8 100644 --- a/lib/Wrapper/WrapperFactory.php +++ b/lib/Helper/IProcessingFileAccessor.php @@ -21,14 +21,19 @@ * along with this program. If not, see . */ -namespace OCA\WorkflowOcr\Wrapper; +namespace OCA\WorkflowOcr\Helper; -class WrapperFactory implements IWrapperFactory { - public function createFpdi(string $pdfContent = ''): IFpdi { - return new FpdiWrapper($pdfContent); - } +interface IProcessingFileAccessor { + /** + * Returns the id of the file which is currently + * processed via OCR + * @return ?int + */ + public function getCurrentlyProcessedFileId() : ?int; - public function createImagick(): IImagick { - return new ImagickWrapper(); - } + /** + * Sets the id of the file which is currently + * processed via OCR + */ + public function setCurrentlyProcessedFileId(?int $fileId) : void; } diff --git a/lib/Helper/ProcessingFileAccessor.php b/lib/Helper/ProcessingFileAccessor.php new file mode 100644 index 0000000..2ec1760 --- /dev/null +++ b/lib/Helper/ProcessingFileAccessor.php @@ -0,0 +1,63 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +namespace OCA\WorkflowOcr\Helper; + +/** + * This class is a singleton which holds the id + * of the currently OCR processed file. This ensures + * that a file is not added to the processing queue + * if the 'postWrite' hook was triggered by a new + * version created by the OCR process. + */ +class ProcessingFileAccessor implements IProcessingFileAccessor { + /** @var ?int */ + private $currentlyProcessedFileId; + + /** @var ProcessingFileAccessor */ + private static $instance; + public static function getInstance() : ProcessingFileAccessor { + if (self::$instance === null) { + self::$instance = new ProcessingFileAccessor(); + } + return self::$instance; + } + + private function __construct() { + // Just ensuring singleton instance ... + } + + /** + * @inheritdoc + */ + public function getCurrentlyProcessedFileId() : ?int { + return $this->currentlyProcessedFileId; + } + + /** + * @inheritdoc + */ + public function setCurrentlyProcessedFileId(?int $fileId) : void { + $this->currentlyProcessedFileId = $fileId; + } +} diff --git a/lib/OcrProcessors/PdfOcrProcessor.php b/lib/OcrProcessors/PdfOcrProcessor.php index 8914fca..f377ce4 100644 --- a/lib/OcrProcessors/PdfOcrProcessor.php +++ b/lib/OcrProcessors/PdfOcrProcessor.php @@ -24,194 +24,30 @@ namespace OCA\WorkflowOcr\OcrProcessors; use OCA\WorkflowOcr\Exception\OcrNotPossibleException; -use OCA\WorkflowOcr\Wrapper\IImagick; -use OCA\WorkflowOcr\Wrapper\IPdfParser; -use OCA\WorkflowOcr\Wrapper\ITesseractOcr; -use OCA\WorkflowOcr\Wrapper\IWrapperFactory; +use OCA\WorkflowOcr\Wrapper\ICommand; class PdfOcrProcessor implements IOcrProcessor { - /** @var IPdfParser */ - private $pdfParser; - /** @var ITesseractOcr */ - private $tesseract; - /** @var IWrapperFactory */ - private $wrapperFactory; + /** @var ICommand */ + private $command; - public function __construct(IPdfParser $pdfParser, 
ITesseractOcr $tesseract, IWrapperFactory $wrapperFactory) { - $this->pdfParser = $pdfParser; - $this->tesseract = $tesseract; - $this->wrapperFactory = $wrapperFactory; + public function __construct(ICommand $command) { + $this->command = $command; } public function ocrFile(string $fileContent): string { - $pagesTextInfo = $this->getPagesTextInfo($fileContent); - - // Check if at least one page in PDF has no text - $this->ensureCanOcrPdf($pagesTextInfo); - - // Split PDF into single pages - $splitted = $this->splitPdf($fileContent); - - // OCR each single page PDF (if it does not contain text already) - $this->ocrPages($splitted, $pagesTextInfo); - - // Merge results - return $this->mergePdf($splitted); - } - - /** - * Returns an associative array (index (int) => containsText (bool)) with information, if the - * page contains text or not. Index starts a 1. - */ - private function getPagesTextInfo(string $pdfContent) : array { - $pdf = $this->pdfParser->parseContent($pdfContent); - - $tmpCnt = 1; - $indices = []; - $pages = $pdf->getPages(); - - foreach ($pages as $page) { - $txt = $page->getText(); - $indices[$tmpCnt++] = !empty($txt) && !empty(trim($txt)); - } - - return $indices; - } - - private function ensureCanOcrPdf(array $pagesTextInfo) : void { - $onePageWithoutText = false; - - foreach ($pagesTextInfo as $idx => $containsText) { - if (!$containsText) { - $onePageWithoutText = true; - break; - } - } - - if (!$onePageWithoutText) { - throw new OcrNotPossibleException('Pdf only contains pages with text'); - } - } - - /** - * Splits PDF into associative array with 1-based index. 
- */ - private function splitPdf(string $pdfContent) : array { - try { - $fpdiWrapper = $this->wrapperFactory->createFpdi($pdfContent); - $pagecount = $fpdiWrapper->getPageCount(); - $splitted = []; - - for ($i = 1; $i <= $pagecount; $i++) { - $onePageFpdiWrapper = $this->wrapperFactory->createFpdi($pdfContent); - $pageId = $onePageFpdiWrapper->import($i); - $s = $onePageFpdiWrapper->getTemplatesize($pageId); - $onePageFpdiWrapper->AddPage($s['orientation'], $s); - $onePageFpdiWrapper->useImportedPage($pageId); - - try { - $content = $onePageFpdiWrapper->Output(null, "S"); - $splitted[$i] = $content; - } finally { - $onePageFpdiWrapper->Close(); - $onePageFpdiWrapper->closeStreams(); - } - } - } finally { - if (isset($fpdiWrapper)) { - $fpdiWrapper->Close(); - $fpdiWrapper->closeStreams(); - } - } - - return $splitted; - } - - /** - * Process each PDF page with ocr algorithm except the pages which already - * contain a text layer. - */ - private function ocrPages(array &$splittedPdfPages, array $pagesTextInfo) : void { - foreach ($splittedPdfPages as $i => $onePagePdf) { - // Skip pages containing text - if ($pagesTextInfo[$i] === true) { - continue; - } - - try { - // Use Imagick to convert the pdf page to png - $img = $this->wrapperFactory->createImagick(); - $img->setOption('density', '300'); - $img->readImageBlob($onePagePdf); - $img->setImageFormat("png"); - - $ocrPdf = $this->processSinglePageImagick($img); - - // Take original page format - $original = $this->wrapperFactory->createFpdi($onePagePdf); - $pageId = $original->import(1); - $originalSize = $original->getTemplatesize($pageId); - - // Import single PDF page with ocr layer - $withOcr = $this->wrapperFactory->createFpdi($ocrPdf); - $pageIdOcr = $withOcr->import(1); - $withOcr->AddPage($originalSize['orientation'], $originalSize); - $withOcr->useImportedPage($pageIdOcr, 0, 0, $originalSize['width'], $originalSize['height'], false); - - // Overwrite original page with scanned one - $splittedPdfPages[$i] 
= $withOcr->Output(null, "S"); - } finally { - if (isset($img)) { - $img->destroy(); - } - if (isset($original)) { - $original->Close(); - $original->closeStreams(); - } - if (isset($withOcr)) { - $withOcr->Close(); - $withOcr->closeStreams(); - } - } - } - } - - private function processSinglePageImagick(IImagick $imagick) : string { - $data = $imagick->getImageBlob(); - $size = $imagick->getImageLength(); - - // Use Tesseract for ocr and converting image back to pdf - $singlePagePdf = $this->tesseract - ->lang(['deu', 'eng']) // TODO make configurable? - ->imageData($data, $size) - ->configFile('pdf') - ->run(); - - return $singlePagePdf; - } - - /** - * Merges single page PDF array into one output PDF. - */ - private function mergePdf(array &$splitted) : string { - try { - $outputPdf = $this->wrapperFactory->createFpdi(); - - foreach ($splitted as $i => $onePageOcrPdf) { - $outputPdf->setContent($onePageOcrPdf); - $pageId = $outputPdf->import(1); - $s = $outputPdf->getTemplatesize($pageId); - $outputPdf->AddPage($s['orientation'], $s); - $outputPdf->useImportedPage($pageId); - } - - $outputPdfContent = $outputPdf->Output(null, "S"); - return $outputPdfContent; - } finally { - if (isset($outputPdf)) { - $outputPdf->Close(); - $outputPdf->closeStreams(); - } + $this->command + ->setCommand("ocrmypdf --redo-ocr -q - - | cat") + ->setStdIn($fileContent); + + $success = $this->command->execute(); + $errorOutput = $this->command->getError(); + $stdErr = $this->command->getStdErr(); + $exitCode = $this->command->getExitCode(); + + if ($success && $errorOutput === '' && $stdErr === '') { + return $this->command->getOutput(); + } else { + throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . '. Message: ' . $errorOutput . ' ' . 
$stdErr); } } } diff --git a/lib/Operation.php b/lib/Operation.php index 6b119d6..7ad7b64 100644 --- a/lib/Operation.php +++ b/lib/Operation.php @@ -36,6 +36,8 @@ use OCP\WorkflowEngine\IRuleMatcher; use OCP\WorkflowEngine\ISpecificOperation; use OCA\WorkflowOcr\BackgroundJobs\ProcessFileJob; +use OCA\WorkflowOcr\Helper\IProcessingFileAccessor; +use OCA\WorkflowOcr\Helper\SynchronizationHelper; use OCP\Files\FileInfo; use OCP\Files\Node; use OCP\IURLGenerator; @@ -51,12 +53,15 @@ class Operation implements ISpecificOperation { private $logger; /** @var IURLGenerator */ private $urlGenerator; + /** @var IProcessingFileAccessor */ + private $processingFileAccessor; - public function __construct(IJobList $jobList, IL10N $l, LoggerInterface $logger, IURLGenerator $urlGenerator) { + public function __construct(IJobList $jobList, IL10N $l, LoggerInterface $logger, IURLGenerator $urlGenerator, IProcessingFileAccessor $processingFileAccessor) { $this->jobList = $jobList; $this->l = $l; $this->logger = $logger; $this->urlGenerator = $urlGenerator; + $this->processingFileAccessor = $processingFileAccessor; } /** @@ -100,7 +105,9 @@ public function onEvent(string $eventName, Event $event, IRuleMatcher $ruleMatch return; } - if (!$this->checkNode($node)) { + if (!$this->pathIsValid($node) || + !$this->ownerExists($node) || + $this->eventTriggeredByOcrProcess($node)) { return; } @@ -115,7 +122,7 @@ public function getEntityId(): string { return File::class; } - private function checkNode(Node $node) : bool { + private function pathIsValid(Node $node) : bool { // Check path has valid structure $filePath = $node->getPath(); // '', admin, 'files', 'path/to/file.pdf' @@ -126,14 +133,29 @@ private function checkNode(Node $node) : bool { return false; } - // Check owner exists + return true; + } + + private function ownerExists(Node $node) : bool { + // Check owner of file exists $owner = $node->getOwner(); if ($owner === null) { $this->logger->debug('Not processing event because file 
with path \'{path}\' has no owner.', - ['path' => $filePath]); + ['path' => $node->getPath()]); return false; } return true; } + + private function eventTriggeredByOcrProcess(Node $node) : bool { + // Check if the event was triggered by OCR rewrite of the file + if ($node->getId() === $this->processingFileAccessor->getCurrentlyProcessedFileId()) { + $this->logger->debug('Not processing event because file with path \'{path}\' was written by OCR process.', + ['path' => $node->getPath()]); + return true; + } + + return false; + } } diff --git a/lib/Wrapper/TesseractOcrWrapper.php b/lib/Wrapper/CommandWrapper.php similarity index 51% rename from lib/Wrapper/TesseractOcrWrapper.php rename to lib/Wrapper/CommandWrapper.php index 1862ace..ebe1264 100644 --- a/lib/Wrapper/TesseractOcrWrapper.php +++ b/lib/Wrapper/CommandWrapper.php @@ -5,9 +5,7 @@ /** * @copyright Copyright (c) 2020 Robin Windey * - * @author Robin Windey - * - * @license GNU AGPL version 3 or any later version + * @license GNU AGPL version 3 or any later version * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as @@ -21,49 +19,68 @@ * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
- * */ namespace OCA\WorkflowOcr\Wrapper; -use thiagoalessio\TesseractOCR\TesseractOCR; +use mikehaertl\shellcommand\Command; -class TesseractOcrWrapper implements ITesseractOcr { - /** @var TesseractOCR */ - private $wrappedTesseract; +class CommandWrapper implements ICommand { + /** @var Command */ + private $command; public function __construct() { - $this->wrappedTesseract = new TesseractOCR(); + $this->command = new Command(); } /** * @inheritdoc */ - public function configFile(string $config) : ITesseractOcr { - $this->wrappedTesseract->configFile($config); + public function setCommand(string $command) : ICommand { + $this->command->setCommand($command); return $this; } /** * @inheritdoc */ - public function lang(array $langs) : ITesseractOcr { - call_user_func_array([$this->wrappedTesseract, 'lang'], array_map('trim', $langs)); + public function setStdIn(string $stdIn) : ICommand { + $this->command->setStdIn($stdIn); return $this; } + + /** + * @inheritdoc + */ + public function execute() : bool { + return (bool)$this->command->execute(); + } + + /** + * @inheritdoc + */ + public function getOutput(bool $trim = true) : string { + return (string)$this->command->getOutput($trim); + } + + /** + * @inheritdoc + */ + public function getError(bool $trim = true) : string { + return (string)$this->command->getError($trim); + } /** * @inheritdoc */ - public function imageData(string $data, int $size) : ITesseractOcr { - $this->wrappedTesseract->imageData($data, $size); - return $this; + public function getStdErr(bool $trim = true) : string { + return (string)$this->command->getStdErr($trim); } /** * @inheritdoc */ - public function run() : string { - return $this->wrappedTesseract->run(); + public function getExitCode() { + return $this->command->getExitCode(); } } diff --git a/lib/Wrapper/FpdiWrapper.php b/lib/Wrapper/FpdiWrapper.php deleted file mode 100644 index 54f7d83..0000000 --- a/lib/Wrapper/FpdiWrapper.php +++ /dev/null @@ -1,78 +0,0 @@ - - * - * @license 
GNU AGPL version 3 or any later version - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -namespace OCA\WorkflowOcr\Wrapper; - -use setasign\Fpdi\Tcpdf\Fpdi; - -class FpdiWrapper extends Fpdi implements IFpdi { - /** @var resource[] */ - private $streams = []; - /** @var int */ - private $pageCount; - - public function __construct(string $pdfContent = '') { - parent::__construct(); - - if ($pdfContent !== '') { - $this->setContent($pdfContent); - } - - $this->setPrintFooter(false); - $this->setPrintHeader(false); - } - - public function setContent(string $pdfContent) : void { - $stream = $this->createStream($pdfContent); - $this->pageCount = $this->setSourceFile($stream); - } - - public function getPageCount(): int { - return $this->pageCount; - } - - public function closeStreams() : void { - foreach ($this->streams as $stream) { - fclose($stream); - } - } - - public function import(int $pageNumber) : string { - return $this->importPage($pageNumber); - } - - private function createStream(string $pdfContent) { - $stream = fopen('php://temp', 'r+'); - - if (!$stream) { - throw new \Exception("Could not open PDF stream"); - } - - fwrite($stream, $pdfContent); - rewind($stream); - - $this->streams[] = $stream; - - return $stream; - } -} diff --git a/lib/Wrapper/ICommand.php b/lib/Wrapper/ICommand.php new file mode 100644 index 0000000..5e2fe0a --- /dev/null +++ 
b/lib/Wrapper/ICommand.php @@ -0,0 +1,80 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +namespace OCA\WorkflowOcr\Wrapper; + +/** + * Interface for a shell commandline. + */ +interface ICommand { + /** + * @param string $command the command or full command string to execute, + * like 'gzip' or 'gzip -d'. You can still call addArg() to add more + * arguments to the command. If $escapeCommand was set to true, the command + * gets escaped with escapeshellcmd(). + * @return static for method chaining + */ + public function setCommand(string $command) : ICommand; + + /** + * @param string|resource $stdIn If set, the string will be piped to the + * command via standard input. This enables the same functionality as + * piping on the command line. It can also be a resource like a file + * handle or a stream in which case its content will be piped into the + * command like an input redirection. + * @return static for method chaining + */ + public function setStdIn(string $stdIn) : ICommand; + + /** + * Execute the command + * + * @return bool whether execution was successful. If `false`, error details + * can be obtained from getError(), getStdErr() and getExitCode(). + */ + public function execute() : bool; + + /** + * @param bool $trim whether to `trim()` the return value. The default is `true`. 
+ * @return string the command output (stdout). Empty if none. + */ + public function getOutput(bool $trim = true) : string; + + /** + * @param bool $trim whether to `trim()` the return value. The default is `true`. + * @return string the error message, either stderr or an internal message. + * Empty string if none. + */ + public function getError(bool $trim = true) : string; + + /** + * @param bool $trim whether to `trim()` the return value. The default is `true`. + * @return string the stderr output. Empty if none. + */ + public function getStdErr(bool $trim = true) : string; + + /** + * @return int|null the exit code or null if command was not executed yet + */ + public function getExitCode(); +} diff --git a/lib/Wrapper/IImagick.php b/lib/Wrapper/IImagick.php deleted file mode 100644 index 7f8c309..0000000 --- a/lib/Wrapper/IImagick.php +++ /dev/null @@ -1,43 +0,0 @@ - - * - * @author Robin Windey - * - * @license GNU AGPL version 3 or any later version - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- * - */ - -namespace OCA\WorkflowOcr\Wrapper; - -use Iterator; - -/** - * Interface for wrapping Imagick library - */ -interface IImagick extends Iterator { - public function setOption(string $key, string $value): void; - public function readImageBlob(string $fileContent): void; - public function setImageFormat(string $targetFormat): void; - public function getImageBlob(): string; - public function getImageLength(): int; - public function getNumberImages(): int; - public function clear(): void; - public function destroy() : void; -} diff --git a/lib/Wrapper/IPdfParser.php b/lib/Wrapper/IPdfParser.php deleted file mode 100644 index cd1cb32..0000000 --- a/lib/Wrapper/IPdfParser.php +++ /dev/null @@ -1,41 +0,0 @@ - - * - * @author Robin Windey - * - * @license GNU AGPL version 3 or any later version - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- * - */ - -namespace OCA\WorkflowOcr\Wrapper; - -use \Smalot\PdfParser\Document; - -/** - * Interface for "wrapping" PdfParser - */ -interface IPdfParser { - /** - * @param $content - * @return Document - * @throws \Exception - */ - public function parseContent($pdfContent) : Document; -} diff --git a/lib/Wrapper/ITesseractOcr.php b/lib/Wrapper/ITesseractOcr.php deleted file mode 100644 index 816ed7d..0000000 --- a/lib/Wrapper/ITesseractOcr.php +++ /dev/null @@ -1,37 +0,0 @@ - - * - * @author Robin Windey - * - * @license GNU AGPL version 3 or any later version - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- * - */ - -namespace OCA\WorkflowOcr\Wrapper; - -/** - * Interface for wrapping Tesseract OCR library - */ -interface ITesseractOcr { - public function configFile(string $config) : ITesseractOcr; - public function lang(array $langs) : ITesseractOcr; - public function imageData(string $data, int $size) : ITesseractOcr; - public function run() : string; -} diff --git a/lib/Wrapper/ImagickWrapper.php b/lib/Wrapper/ImagickWrapper.php deleted file mode 100644 index a961981..0000000 --- a/lib/Wrapper/ImagickWrapper.php +++ /dev/null @@ -1,115 +0,0 @@ - - * - * @author Robin Windey - * - * @license GNU AGPL version 3 or any later version - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- * - */ - -namespace OCA\WorkflowOcr\Wrapper; - -class ImagickWrapper implements IImagick { - /** @var \Imagick */ - private $wrappedImagick; - - public function __construct() { - $this->wrappedImagick = new \Imagick(); - } - - /** - * @inheritdoc - */ - public function setOption(string $key, string $value): void { - $this->wrappedImagick->setOption($key, $value); - } - - /** - * @inheritdoc - */ - public function readImageBlob(string $fileContent): void { - $this->wrappedImagick->readImageBlob($fileContent); - } - - /** - * @inheritdoc - */ - public function setImageFormat(string $targetFormat): void { - $this->wrappedImagick->setImageFormat($targetFormat); - } - - /** - * @inheritdoc - */ - public function getImageBlob(): string { - return $this->wrappedImagick->getImageBlob(); - } - - /** - * @inheritdoc - */ - public function getImageLength(): int { - return $this->wrappedImagick->getImageLength(); - } - - /** - * @inheritdoc - */ - public function getNumberImages(): int { - return (int)$this->wrappedImagick->getNumberImages(); - } - - /** - * @inheritdoc - */ - public function clear(): void { - $this->wrappedImagick->clear(); - } - - /** - * @return mixed - */ - public function current() { - return $this->wrappedImagick->current(); - } - - /** - * @return scalar - */ - public function key() { - return $this->wrappedImagick->key(); - } - - public function next() : void { - $this->wrappedImagick->next(); - } - - public function rewind() : void { - $this->wrappedImagick->rewind(); - } - - public function valid() : bool { - return $this->wrappedImagick->valid(); - } - - public function destroy() : void { - $this->wrappedImagick->destroy(); - } -} diff --git a/lib/Wrapper/PdfParserWrapper.php b/lib/Wrapper/PdfParserWrapper.php deleted file mode 100644 index 22241d7..0000000 --- a/lib/Wrapper/PdfParserWrapper.php +++ /dev/null @@ -1,46 +0,0 @@ - - * - * @author Robin Windey - * - * @license GNU AGPL version 3 or any later version - * - * This program is free 
software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - */ - -namespace OCA\WorkflowOcr\Wrapper; - -use \Smalot\PdfParser\Document; -use \Smalot\PdfParser\Parser; - -class PdfParserWrapper implements IPdfParser { - /** @var Parser */ - private $wrappedParser; - - public function __construct() { - $this->wrappedParser = new Parser(); - } - - /** - * @inheritdoc - */ - public function parseContent($pdfContent) : Document { - return $this->wrappedParser->parseContent($pdfContent); - } -} diff --git a/lib/Wrapper/ViewWrapper.php b/lib/Wrapper/ViewWrapper.php index 6f4a349..04be318 100644 --- a/lib/Wrapper/ViewWrapper.php +++ b/lib/Wrapper/ViewWrapper.php @@ -44,6 +44,6 @@ public function file_put_contents(string $filePath, string $content) : bool { if (is_bool($retVal)) { return $retVal; } - return boolval($retVal); // TODO :: method above returns numeric value (e.g. 
10023) + return boolval($retVal); } } diff --git a/tests/Integration/ViewWrapperTest.php b/tests/Integration/ViewWrapperTest.php new file mode 100644 index 0000000..e36166c --- /dev/null +++ b/tests/Integration/ViewWrapperTest.php @@ -0,0 +1,89 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +namespace OCA\WorkflowOcr\Tests\Integration; + +use Exception; +use OC\Files\View; +use OCA\WorkflowOcr\Tests\TestUtils; +use OCA\WorkflowOcr\Wrapper\ViewWrapper; +use Test\TestCase; + +/** + * @group DB + */ +class ViewWrapperTest extends TestCase { + + /** @var TestUtils */ + private $testUtils; + + protected function setUp() : void { + parent::setUp(); + $this->testUtils = new TestUtils(); + } + + /** + * @dataProvider dataProvider_FilePutContents + */ + public function testFilePutContents(string $filename, bool $expectedResult) { + $user = 'mytestuser'; + $pw = 'myuserspw'; + $path = '/mytestuser/files'; + $content = 'hello world'; + + /** @var \OCP\IUser */ + $userObject = null; + + try { + $userObject = $this->testUtils->createUser($user, $pw); + $this->loginAsUser($user); + + $viewWrapper = new ViewWrapper($path); + + $result = $viewWrapper->file_put_contents($filename, $content); + $this->assertEquals($expectedResult, $result); + + // If we expect that we can write to the file we should + // be able to read 
the file afterwards + if ($expectedResult) { + $ncView = new View($path); + $readContent = $ncView->file_get_contents($filename); + $this->assertEquals($content, $readContent); + } + } finally { + if ($userObject) { + $this->logout(); + if (!$userObject->delete()) { + throw new Exception("Could not delete user " . $user); + } + } + } + } + + public function dataProvider_FilePutContents() { + return [ + ['testfile.txt', true], + ['this_is_invalid/..', false] + ]; + } +} diff --git a/tests/Unit/AppInfo/ApplicationTest.php b/tests/Unit/AppInfo/ApplicationTest.php index 22c2acb..f5fe94d 100644 --- a/tests/Unit/AppInfo/ApplicationTest.php +++ b/tests/Unit/AppInfo/ApplicationTest.php @@ -47,9 +47,9 @@ public function testAutoloadExecutedOnBoot() { $app->boot($bootContext); - // PdfParser is one of the dependencies included by autoload.php - $phpParserExists = class_exists('Smalot\PdfParser\Parser'); - $this->assertTrue($phpParserExists); + // 'Command' is one of the dependencies included by autoload.php + $commandClassExists = class_exists('mikehaertl\shellcommand\Command'); + $this->assertTrue($commandClassExists); } /** diff --git a/tests/Unit/BackgroundJobs/ProcessFileJobTest.php b/tests/Unit/BackgroundJobs/ProcessFileJobTest.php index 7c49047..cd60e13 100644 --- a/tests/Unit/BackgroundJobs/ProcessFileJobTest.php +++ b/tests/Unit/BackgroundJobs/ProcessFileJobTest.php @@ -29,6 +29,7 @@ use OCA\WorkflowOcr\BackgroundJobs\ProcessFileJob; use OCA\WorkflowOcr\Exception\OcrNotPossibleException; use OCA\WorkflowOcr\Exception\OcrProcessorNotFoundException; +use OCA\WorkflowOcr\Helper\IProcessingFileAccessor; use OCA\WorkflowOcr\Service\IOcrService; use OCA\WorkflowOcr\Wrapper\IFilesystem; use OCA\WorkflowOcr\Wrapper\IView; @@ -68,6 +69,8 @@ class ProcessFileJobTest extends TestCase { private $userManager; /** @var IUser|MockObject */ private $user; + /** @var IProcessingFileAccessor|MockObject */ + private $processingFileAccessor; /** @var JobList */ private $jobList; /** 
@var ProcessFileJob */ @@ -76,18 +79,13 @@ class ProcessFileJobTest extends TestCase { public function setUp() : void { parent::setUp(); - /** @var LoggerInterface */ $this->logger = $this->createMock(LoggerInterface::class); - /** @var IRootFolder */ $this->rootFolder = $this->createMock(IRootFolder::class); - /** @var IOcrService */ $this->ocrService = $this->createMock(IOcrService::class); - /** @var IViewFactory */ $this->viewFactory = $this->createMock(IViewFactory::class); - /** @var IFilesystem */ $this->filesystem = $this->createMock(IFilesystem::class); - /** @var IUserSession */ $this->userSession = $this->createMock(IUserSession::class); + $this->processingFileAccessor = $this->createMock(IProcessingFileAccessor::class); $userManager = $this->createMock(IUserManager::class); $user = $this->createMock(IUser::class); @@ -95,9 +93,7 @@ public function setUp() : void { ->withAnyParameters() ->willReturn($user); - /** @var IUserManager */ $this->userManager = $userManager; - /** @var IUser */ $this->user = $user; $this->processFileJob = new ProcessFileJob( @@ -107,7 +103,8 @@ public function setUp() : void { $this->viewFactory, $this->filesystem, $this->userManager, - $this->userSession + $this->userSession, + $this->processingFileAccessor ); /** @var IConfig */ @@ -337,7 +334,8 @@ public function testThrowsNoUserException_OnNonExistingUser() { $this->viewFactory, $this->filesystem, $userManager, - $this->userSession + $this->userSession, + $this->processingFileAccessor ); $arguments = ['filePath' => '/admin/files/someInvalidStuff', 'uid' => 'nonexistinguser']; $processFileJob->setArgument($arguments); @@ -345,6 +343,53 @@ public function testThrowsNoUserException_OnNonExistingUser() { $processFileJob->execute($this->jobList); } + /** + * @dataProvider dataProvider_ValidArguments + */ + public function testCallsProcessingFileAccessor(array $arguments, string $user, string $rootFolderPath) { + $this->processFileJob->setArgument($arguments); + $mimeType = 
'application/pdf'; + $content = 'someFileContent'; + $ocrContent = 'someOcrProcessedFile'; + $filePath = $arguments['filePath']; + $dirPath = dirname($filePath); + $filename = basename($filePath); + + $fileMock = $this->createValidFileMock($mimeType, $content); + $this->rootFolder->method('get') + ->with($arguments['filePath']) + ->willReturn($fileMock); + + $this->ocrService->expects($this->once()) + ->method('ocrFile') + ->willReturn($ocrContent); + + $viewMock = $this->createMock(IView::class); + $this->viewFactory->expects($this->once()) + ->method('create') + ->willReturn($viewMock); + + $calledWith42 = 0; + $calledWithNull = 0; + + $this->processingFileAccessor->expects($this->exactly(2)) + ->method('setCurrentlyProcessedFileId') + ->with($this->callback(function ($id) use (&$calledWith42, &$calledWithNull) { + if ($id === 42) { + $calledWith42++; + } elseif ($id === null) { + $calledWithNull++; + } + + return true; + })); + + $this->processFileJob->execute($this->jobList); + + $this->assertEquals(1, $calledWith42); + $this->assertEquals(1, $calledWithNull); + } + public function dataProvider_InvalidArguments() { $arr = [ [null, 1], @@ -396,6 +441,8 @@ private function createValidFileMock(string $mimeType = 'application/pdf', strin ->willReturn($mimeType); $fileMock->method('getContent') ->willReturn($content); + $fileMock->method('getId') + ->willReturn(42); return $fileMock; } } diff --git a/lib/Wrapper/IWrapperFactory.php b/tests/Unit/Helper/ProcessingFIleAccessorTest.php similarity index 59% rename from lib/Wrapper/IWrapperFactory.php rename to tests/Unit/Helper/ProcessingFIleAccessorTest.php index 384b7a1..4d220e1 100644 --- a/lib/Wrapper/IWrapperFactory.php +++ b/tests/Unit/Helper/ProcessingFIleAccessorTest.php @@ -21,9 +21,23 @@ * along with this program. If not, see . 
*/ -namespace OCA\WorkflowOcr\Wrapper; +namespace OCA\WorkflowOcr\Tests\Unit\Helper; -interface IWrapperFactory { - public function createFpdi(string $pdfContent = '') : IFpdi; - public function createImagick() : IImagick; +use OCA\WorkflowOcr\Helper\ProcessingFileAccessor; +use PHPUnit\Framework\TestCase; + +class ProcessingFileAccessorTest extends TestCase { + public function testSingleton() { + $o1 = ProcessingFileAccessor::getInstance(); + $o2 = ProcessingFileAccessor::getInstance(); + + $this->assertTrue($o1 === $o2); + } + + public function testGetSet() { + $o = ProcessingFileAccessor::getInstance(); + $o ->setCurrentlyProcessedFileId(42); + $this->assertEquals(42, $o->getCurrentlyProcessedFileId()); + $o->setCurrentlyProcessedFileId(null); + } } diff --git a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php index b14dbc3..0fb6c94 100644 --- a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php +++ b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php @@ -23,161 +23,77 @@ namespace OCA\WorkflowOcr\Tests\Unit\OcrProcessors; -use PHPUnit\Framework\TestCase; use OCA\WorkflowOcr\Exception\OcrNotPossibleException; use OCA\WorkflowOcr\OcrProcessors\PdfOcrProcessor; -use OCA\WorkflowOcr\Wrapper\IFpdi; -use OCA\WorkflowOcr\Wrapper\IImagick; -use OCA\WorkflowOcr\Wrapper\IPdfParser; -use OCA\WorkflowOcr\Wrapper\ITesseractOcr; -use OCA\WorkflowOcr\Wrapper\IWrapperFactory; +use OCA\WorkflowOcr\Wrapper\ICommand; use PHPUnit\Framework\MockObject\MockObject; -use \Smalot\PdfParser\Document; -use Smalot\PdfParser\Page; +use PHPUnit\Framework\TestCase; class PdfOcrProcessorTest extends TestCase { - /** @var MockObject|IPdfParser */ - private $pdfParser; - /** @var MockObject|ITesseractOcr */ - private $tesseract; - /** @var MockObject|IWrapperFactory */ - private $wrapperFactory; - /** @var MockObject|IFpdi */ - private $fpdi; - /** @var MockObject|IImagick */ - private $imagick; + /** @var ICommand|MockObject */ + private $command; 
protected function setUp(): void { parent::setUp(); - $this->pdfParser = $this->createMock(IPdfParser::class); - $this->tesseract = $this->createMock(ITesseractOcr::class); - $this->wrapperFactory = $this->createMock(IWrapperFactory::class); - $this->fpdi = $this->createMock(IFpdi::class); - $this->imagick = $this->createMock(IImagick::class); - $this->wrapperFactory->method('createFpdi') - ->withAnyParameters() - ->willReturn($this->fpdi); - $this->wrapperFactory->method('createImagick') - ->with() - ->willReturn($this->imagick); - } - - public function testThrowsOcrNotPossibleException_IfPdfContainsPagesWithTextOnly() { - /* - Setup fake PDF document with 2 pages containing text-layer - */ - $fakePdfDocument = $this->setUpFakePdfDocument('Page1Text', 'Page2Text'); - $this->pdfParser->expects($this->once()) - ->method('parseContent') - ->with('someBinaryPdfContent') - ->willReturn($fakePdfDocument); - - $this->expectException(OcrNotPossibleException::class); - $pdfProcessor = new PdfOcrProcessor($this->pdfParser, $this->tesseract, $this->wrapperFactory); - $pdfProcessor->ocrFile('someBinaryPdfContent'); - } - - public function testSplitPdfIsDone() { - /* - Setup fake PDF document with 3 pages containing no text-layers - */ - $fakePdfDocument = $this->setUpFakePdfDocument('', '', ''); - $this->pdfParser->expects($this->once()) - ->method('parseContent') - ->with('someBinaryPdfContent') - ->willReturn($fakePdfDocument); - $this->fpdi->expects($this->once()) - ->method('getPageCount') - ->with() - ->willReturn(3); - $this->fpdi->method('getTemplatesize') - ->willReturn([ - 'orientation' => 'someOrientation', - 'width' => 50, - 'height' => 50 - ]); - $this->fpdi->method('Output') - ->with(null, "S") - ->willReturn('someBinaryPdfContentOfOnePage'); - $this->fpdi->expects($this->atLeast(3)) - ->method('import') - ->with($this->logicalOr($this->equalTo(1), $this->equalTo(2), $this->equalTo(3))); - - $pdfProcessor = new PdfOcrProcessor($this->pdfParser, $this->tesseract, 
$this->wrapperFactory); - $pdfProcessor->ocrFile('someBinaryPdfContent'); + $this->command = $this->createMock(ICommand::class); } - public function testOcrIsCalledForEachPageWithoutText() { - /* - Setup fake PDF document with 3 pages. 2 without text-layer and one with text-layer. - */ - $fakePdfDocument = $this->setUpFakePdfDocument('', 'thisPageContainsText', ''); - $this->pdfParser->expects($this->once()) - ->method('parseContent') - ->with('someBinaryPdfContent') - ->willReturn($fakePdfDocument); - $this->fpdi->expects($this->once()) - ->method('getPageCount') - ->with() - ->willReturn(3); - $this->fpdi->method('getTemplatesize') - ->willReturn([ - 'orientation' => 'someOrientation', - 'width' => 50, - 'height' => 50 - ]); - $this->fpdi->method('Output') - ->with(null, "S") - ->willReturn('someBinaryPdfContentOfOnePage'); + public function testCallsCommandInterface() { + $pdfBefore = 'someFileContent'; + $pdfAfter = 'someOcrFileContent'; + + $this->command->expects($this->once()) + ->method('setCommand') + ->willReturn($this->command); + $this->command->expects($this->once()) + ->method('setStdIn') + ->with($pdfBefore) + ->willReturn($this->command); + $this->command->expects($this->once()) + ->method('execute') + ->willReturn(true); + $this->command->expects($this->once()) + ->method('getOutput') + ->willReturn($pdfAfter); - $imageBlob = 'someImageBlob'; - $imageSize = 16; - $this->imagick->method('getImageBlob') - ->with() - ->willReturn($imageBlob); - $this->imagick->method('getImageLength') - ->with() - ->willReturn($imageSize); - - // These methods are called for each page which is processed - $this->tesseract->expects($this->exactly(2)) - ->method('lang') - ->with(['deu', 'eng']) - ->willReturn($this->tesseract); - $this->tesseract->expects($this->exactly(2)) - ->method('imageData') - ->with($imageBlob, $imageSize) - ->willReturn($this->tesseract); - $this->tesseract->expects($this->exactly(2)) - ->method('configFile') - ->with('pdf') - 
->willReturn($this->tesseract); - $this->tesseract->expects($this->exactly(2)) - ->method('run') - ->with(); - - $pdfProcessor = new PdfOcrProcessor($this->pdfParser, $this->tesseract, $this->wrapperFactory); - $pdfProcessor->ocrFile('someBinaryPdfContent'); + $processor = new PdfOcrProcessor($this->command); + $result = $processor->ocrFile($pdfBefore); + + $this->assertEquals($pdfAfter, $result); } - private function setUpFakePdfDocument(...$pageTexts) : MockObject { - $pageArray = []; - foreach ($pageTexts as $pageText) { - $fakePage = $this->createMock(Page::class); - $fakePage->expects($this->once()) - ->method('getText') - ->with() - ->willReturn($pageText); - $pageArray[] = $fakePage; + public function testThrowsOcrNotPossibleException() { + $pdfBefore = 'someFileContent'; + $pdfAfter = 'someOcrFileContent'; + + $this->command->expects($this->once()) + ->method('setCommand') + ->willReturn($this->command); + $this->command->expects($this->once()) + ->method('setStdIn') + ->with($pdfBefore) + ->willReturn($this->command); + $this->command->expects($this->once()) + ->method('execute') + ->willReturn(false); + $this->command->expects($this->never()) + ->method('getOutput'); + $this->command->expects($this->once()) + ->method('getError'); + $this->command->expects($this->once()) + ->method('getExitCode'); + + $processor = new PdfOcrProcessor($this->command); + $thrown = false; + + try { + $result = $processor->ocrFile($pdfBefore); + } catch (\Throwable $t) { + $thrown = true; + $this->assertInstanceOf(OcrNotPossibleException::class, $t); } - - $fakePdfDocument = $this->createMock(Document::class); - $fakePdfDocument->expects($this->once()) - ->method('getPages') - ->with() - ->willReturn($pageArray); - - return $fakePdfDocument; + + $this->assertTrue($thrown); } } diff --git a/tests/Unit/OperationTest.php b/tests/Unit/OperationTest.php index 6a3aa9e..0a3d588 100644 --- a/tests/Unit/OperationTest.php +++ b/tests/Unit/OperationTest.php @@ -25,6 +25,7 @@ use 
OCA\WorkflowEngine\Entity\File; use OCA\WorkflowOcr\BackgroundJobs\ProcessFileJob; +use OCA\WorkflowOcr\Helper\IProcessingFileAccessor; use OCA\WorkflowOcr\Operation; use OCP\BackgroundJob\IJobList; use OCP\EventDispatcher\Event; @@ -50,13 +51,16 @@ class OperationTest extends TestCase { private $logger; /** @var IURLGenerator|MockObject */ private $urlGenerator; - + /** @var IProcessingFileAccessor|MockObject */ + private $processingFileAccessor; + protected function setUp(): void { parent::setUp(); $this->jobList = $this->createMock(IJobList::class); $this->l = $this->createMock(IL10N::class); $this->logger = $this->createMock(LoggerInterface::class); $this->urlGenerator = $this->createMock(IURLGenerator::class); + $this->processingFileAccessor = $this->createMock(IProcessingFileAccessor::class); } /** @@ -70,7 +74,7 @@ public function testDoesNothingOnInvalidEvent(string $eventName, Event $event) { ->method('debug') ->withAnyParameters(); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); /** @var IRuleMatcher */ $ruleMatcher = $this->createMock(IRuleMatcher::class); $operation->onEvent($eventName, $event, $ruleMatcher); @@ -84,7 +88,7 @@ public function testDoesNothingOnFolderEvent() { ->method('debug') ->withAnyParameters(); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $fileMock = $this->createMock(Node::class); $fileMock->method('getType') @@ -97,6 +101,43 @@ public function testDoesNothingOnFolderEvent() { $operation->onEvent($eventName, $event, $ruleMatcher); } + public function testDoesNothingOnPostWriteTriggeredByCurrentOcrProcess() { + $this->jobList->expects($this->never()) + ->method('add') + ->withAnyParameters(); + 
$this->logger->expects($this->once()) + ->method('debug') + ->withAnyParameters(); + + /** @var IProcessingFileAccessor|MockObject */ + $processingFileAccessorMock = $this->createMock(IProcessingFileAccessor::class); + $processingFileAccessorMock->expects($this->once()) + ->method('getCurrentlyProcessedFileId') + ->willReturn(42); + + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $processingFileAccessorMock); + + $userMock = $this->createMock(IUser::class); + $userMock->expects($this->never()) + ->method('getUID'); + $fileMock = $this->createMock(Node::class); + $fileMock->method('getType') + ->willReturn(FileInfo::TYPE_FILE); + $fileMock->method('getPath') + ->willReturn('/someuser/files/somefile.pdf'); + $fileMock->method('getOwner') + ->willReturn($userMock); + $fileMock->method('getId') + ->willReturn(42); + $event = new GenericEvent($fileMock); + /** @var IRuleMatcher */ + $ruleMatcher = $this->createMock(IRuleMatcher::class); + $eventName = '\OCP\Files::postCreate'; + + $operation->onEvent($eventName, $event, $ruleMatcher); + } + + /** * @dataProvider dataProvider_InvalidFilePaths */ @@ -108,7 +149,7 @@ public function testDoesNothingOnInvalidFilePath(string $filePath) { ->method('debug') ->withAnyParameters(); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $fileMock = $this->createMock(Node::class); $fileMock->method('getType') @@ -131,7 +172,7 @@ public function testDoesNothingOnFileWithoutOwner() { ->method('debug') ->withAnyParameters(); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $fileMock = $this->createMock(Node::class); $fileMock->method('getType') @@ -156,7 +197,7 @@ public 
function testAddWithCorrectFilePathAndUser() { ->method('add') ->with(ProcessFileJob::class, ['filePath' => $filePath, 'uid' => $uid]); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $userMock = $this->createMock(IUser::class); $userMock->expects($this->once()) @@ -169,6 +210,8 @@ public function testAddWithCorrectFilePathAndUser() { ->willReturn($filePath); $fileMock->method('getOwner') ->willReturn($userMock); + $fileMock->method('getId') + ->willReturn(42); $event = new GenericEvent($fileMock); /** @var IRuleMatcher */ $ruleMatcher = $this->createMock(IRuleMatcher::class); @@ -181,7 +224,7 @@ public function testAddWithCorrectFilePathAndUser() { * @dataProvider dataProvider_ValidScopes */ public function testIsAvailableForScope(int $scope) { - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $result = $operation->isAvailableForScope($scope); $this->assertTrue($result); @@ -197,7 +240,7 @@ public function testDoesNothing_OnValidateOperation() { $this->urlGenerator->expects($this->never()) ->method($this->anything()); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $operation->validateOperation('aName', [], 'aOp'); } @@ -206,7 +249,7 @@ public function testCallsLang_OnGetDisplayName() { $this->l->expects($this->once()) ->method('t'); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $operation->getDisplayName(); 
} @@ -216,7 +259,7 @@ public function testCallsLang_OnGetDescription() { $this->l->expects($this->once()) ->method('t'); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $operation->getDescription(); } @@ -225,13 +268,13 @@ public function testCallsUrlGenerator_OnGetIcon() { $this->urlGenerator->expects($this->once()) ->method('imagePath'); - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $operation->getIcon(); } public function testEntityIdIsFile() { - $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator); + $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor); $this->assertEquals(File::class, $operation->getEntityId()); } diff --git a/lib/Wrapper/IFpdi.php b/tests/Unit/Wrapper/CommandWrapperTest.php similarity index 55% rename from lib/Wrapper/IFpdi.php rename to tests/Unit/Wrapper/CommandWrapperTest.php index e591a1f..6afd012 100644 --- a/lib/Wrapper/IFpdi.php +++ b/tests/Unit/Wrapper/CommandWrapperTest.php @@ -21,16 +21,26 @@ * along with this program. If not, see . 
*/ -namespace OCA\WorkflowOcr\Wrapper; +namespace OCA\WorkflowOcr\Tests\Unit\Wrapper; -interface IFpdi { - public function setContent(string $pdfContent) : void; - public function getPageCount(): int; - public function closeStreams() : void; - public function import(int $pageNumber) : string; - public function getTemplateSize(string $tpl); - public function AddPage($orientation='', $format='', $keepmargins=false, $tocpage=false); - public function useImportedPage(string $pageId, $x = 0, $y = 0, $width = null, $height = null, $adjustPageSize = false); - public function Output($name='doc.pdf', $dest='I'); - public function Close(); +use OCA\WorkflowOcr\Wrapper\CommandWrapper; +use PHPUnit\Framework\TestCase; + +class CommandWrapperTest extends TestCase { + public function testWrappingPositiveCommand() { + $cmd = new CommandWrapper(); + $cmd->setCommand('cat') + ->setStdIn('hello'); + $this->assertTrue($cmd->execute()); + $this->assertEquals('hello', $cmd->getOutput()); + $this->assertEquals(0, $cmd->getExitCode()); + } + + public function testWrappingNegativeCommand() { + $cmd = new CommandWrapper(); + $cmd->setCommand('echo hello 1>&2'); + $cmd->execute(); + $this->assertEquals('hello', $cmd->getStdErr()); + $this->assertEquals('', $cmd->getError()); + } }