From c716ff8faa3393b2f91a1924af7c15c9e414a9f4 Mon Sep 17 00:00:00 2001 From: Robin Windey Date: Sat, 14 Dec 2024 16:06:01 +0000 Subject: [PATCH] Add option to keep original file date Implements #256 --- README.md | 1 + lib/Model/WorkflowSettings.php | 11 +++++++++ lib/Service/OcrService.php | 19 ++++++++++++--- lib/Wrapper/IView.php | 1 + lib/Wrapper/ViewWrapper.php | 7 ++++++ src/components/WorkflowOcr.vue | 9 ++++++- src/test/components/WorkflowOcr.spec.js | 32 +++++++++++++++++++++---- tests/Integration/ViewWrapperTest.php | 16 +++++++++++++ tests/Unit/Service/OcrServiceTest.php | 31 +++++++++++++++++++++++- tests/psalm-baseline.xml | 1 + 10 files changed, 118 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d138bde..53a42f0 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,7 @@ Assign tags after OCR | These tags will be assigned to the file after it has bee Remove tags after OCR | These tags will be removed from the file after it has been successfully processed. If the file does not have the tag, it will just be skipped. | OCR mode | Controls the way files are processed, which already have OCR content. For PDF files this setting corresponds to the `--skip-text`, `--redo-ocr` and `--force-ocr` parameters of `ocrmypdf`. See [official docs](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) for additional information.
**Skip text:** skip pages completely that already contain text. Such a page will not be touched and just be copied to the final output.
**Redo OCR:** perform a detailed text analysis to split up pages into areas with and without text.
**Force OCR:** all pages will be rasterized to images and OCR will be performed on every page. | Keep original file version | If the switch is set, the original file (before applying OCR) will be kept. This is done by giving the file version the label `Before OCR`. This version will be excluded from the automatic expiration process (see [here](https://docs.nextcloud.com/server/latest/user_manual/en/files/version_control.html#naming-a-version) for details) | +Keep original file modification date | Restore the modification date of the original file. The original modification date will be applied to the newly created file version. This is useful if you need to preserve the file modification date, for example to be able to sort files accordingly. | Remove background\* | If the switch is set, the OCR processor will try to remove the background of the document before processing and instead set a white background. For PDF files this setting corresponds to the [`--remove-background`](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html?highlight=remove-background#image-processing) parameter of `ocrmypdf`.
:warning: Please note that this flag will currently only work with **`ocrmypdf` versions prior to 13**. It might be added in future versions again. See [here](https://github.com/ocrmypdf/OCRmyPDF/issues/884) for details. :warning:| Custom ocrMyPdf CLI arguments | If you want to pass custom arguments to the `ocrmypdf` CLI, you can do so here. Please note that the arguments will be passed as they are to the CLI, so make sure to use the correct syntax. Check the [official docs](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html) for more information. | diff --git a/lib/Model/WorkflowSettings.php b/lib/Model/WorkflowSettings.php index 637d173..49aee5a 100644 --- a/lib/Model/WorkflowSettings.php +++ b/lib/Model/WorkflowSettings.php @@ -52,6 +52,9 @@ class WorkflowSettings { /** @var bool */ private $keepOriginalFileVersion = false; + /** @var bool */ + private $keepOriginalFileDate = false; + /** @var string */ private $customCliArgs = ''; @@ -104,6 +107,13 @@ public function getKeepOriginalFileVersion(): bool { return $this->keepOriginalFileVersion; } + /** + * @return bool + */ + public function getKeepOriginalFileDate(): bool { + return $this->keepOriginalFileDate; + } + /** * @return string */ @@ -143,6 +153,7 @@ private function setJson(?string $json = null) { $this->setProperty($this->tagsToRemoveAfterOcr, $data, 'tagsToRemoveAfterOcr', fn ($value) => is_array($value)); $this->setProperty($this->tagsToAddAfterOcr, $data, 'tagsToAddAfterOcr', fn ($value) => is_array($value)); $this->setProperty($this->keepOriginalFileVersion, $data, 'keepOriginalFileVersion', fn ($value) => is_bool($value)); + $this->setProperty($this->keepOriginalFileDate, $data, 'keepOriginalFileDate', fn ($value) => is_bool($value)); $this->setProperty($this->customCliArgs, $data, 'customCliArgs', fn ($value) => is_string($value)); } diff --git a/lib/Service/OcrService.php b/lib/Service/OcrService.php index 1e2f068..6fabcbf 100644 --- a/lib/Service/OcrService.php +++ 
b/lib/Service/OcrService.php @@ -121,6 +121,13 @@ public function runOcrProcess(int $fileId, string $uid, WorkflowSettings $settin $this->initUserEnvironment($uid); $file = $this->getNode($fileId); + + $fileMtime = null; + if ($settings->getKeepOriginalFileDate()) { + // Add one second to the original file modification time (mtime is a Unix timestamp in seconds) to prevent the retained original version from being overwritten + $fileMtime = $file->getMTime() + 1; + } + $ocrProcessor = $this->ocrProcessorFactory->create($file->getMimeType()); $globalSettings = $this->globalSettingsService->getGlobalSettings(); @@ -153,7 +160,7 @@ public function runOcrProcess(int $fileId, string $uid, WorkflowSettings $settin $filePath : $filePath . '.pdf'; - $this->createNewFileVersion($newFilePath, $fileContent, $fileId); + $this->createNewFileVersion($newFilePath, $fileContent, $fileId, $fileMtime); } $this->eventService->textRecognized($result, $file); @@ -180,7 +187,7 @@ private function shutdownUserEnvironment() : void { $this->userSession->setUser(null); } - private function getNode(int $fileId) : ?Node { + private function getNode(int $fileId) : Node { /** @var File[] */ $nodeArr = $this->rootFolder->getById($fileId); if (count($nodeArr) === 0) { @@ -223,8 +230,9 @@ private function processTagsAfterSuccessfulOcr(File $file, WorkflowSettings $set * @param string $filePath The filepath of the file to write * @param string $ocrContent The new filecontent (which was OCR processed) * @param int $fileId The id of the file to write. Used for locking. + * @param int|null $fileMtime The mtime of the new file. Can be used to restore the original modification time of the non-OCR file. If null, the modification time is left untouched.
*/ - private function createNewFileVersion(string $filePath, string $ocrContent, int $fileId) : void { + private function createNewFileVersion(string $filePath, string $ocrContent, int $fileId, ?int $fileMtime = null) : void { $dirPath = dirname($filePath); $filename = basename($filePath); @@ -237,6 +245,11 @@ private function createNewFileVersion(string $filePath, string $ocrContent, int // add the file to the queue again but this is tackled // by the processingFileAccessor. $view->file_put_contents($filename, $ocrContent); + + // Restore the original modification time of the non-OCR file + if ($fileMtime !== null) { + $view->touch($filename, $fileMtime); + } } finally { $this->processingFileAccessor->setCurrentlyProcessedFileId(null); } diff --git a/lib/Wrapper/IView.php b/lib/Wrapper/IView.php index 98cdf33..804deff 100644 --- a/lib/Wrapper/IView.php +++ b/lib/Wrapper/IView.php @@ -31,4 +31,5 @@ */ interface IView { public function file_put_contents(string $filePath, string $content) : bool; + public function touch($path, $mtime = null): bool; } diff --git a/lib/Wrapper/ViewWrapper.php b/lib/Wrapper/ViewWrapper.php index bc2912f..6f21acb 100644 --- a/lib/Wrapper/ViewWrapper.php +++ b/lib/Wrapper/ViewWrapper.php @@ -43,4 +43,11 @@ public function file_put_contents(string $filePath, string $content) : bool { $retVal = $this->wrappedView->file_put_contents($filePath, $content); return boolval($retVal); } + + /** + * @inheritdoc + */ + public function touch($path, $mtime = null): bool { + return $this->wrappedView->touch($path, $mtime); + } } diff --git a/src/components/WorkflowOcr.vue b/src/components/WorkflowOcr.vue index fb1077b..3a73f26 100644 --- a/src/components/WorkflowOcr.vue +++ b/src/components/WorkflowOcr.vue @@ -89,11 +89,16 @@ type="switch"> {{ t('workflow_ocr', 'Remove background') }} - {{ t('workflow_ocr', 'Keep original file version') }} + + {{ t('workflow_ocr', 'Keep original file modification date') }} +
@@ -140,6 +145,7 @@ export default { * tagsToRemoveAfterOcr: [42, 43], * removeBackground: true, * keepOriginalFileVersion: true, + * keepOriginalFileDate: true, * ocrMode: 0, * customCliArgs: '--rotate-pages-threshold 8', * } @@ -151,6 +157,7 @@ export default { tagsToRemoveAfterOcr: [], removeBackground: false, keepOriginalFileVersion: false, + keepOriginalFileDate: false, ocrMode: 0, customCliArgs: '', }, diff --git a/src/test/components/WorkflowOcr.spec.js b/src/test/components/WorkflowOcr.spec.js index b86db2b..999445a 100644 --- a/src/test/components/WorkflowOcr.spec.js +++ b/src/test/components/WorkflowOcr.spec.js @@ -153,7 +153,7 @@ describe('Language settings tests', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') + expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}') }) }) @@ -182,7 +182,7 @@ describe('Add/remove tags tests', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[1,2],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') + expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[1,2],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}') }) test('User input for removeTagsAfterOcr is applied correctly on empty component', async () => { @@ -202,7 +202,7 @@ describe('Add/remove tags tests', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - 
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[1,2],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') + expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[1,2],"removeBackground":true,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}') }) }) @@ -239,7 +239,7 @@ describe('Remove background tests', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') + expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}') }) }) @@ -340,6 +340,28 @@ describe('Custom CLI args test', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":[],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":"--dpi 300"}') + expect(inputEvent[0][0]).toBe('{"languages":[],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":"--dpi 300"}') + }) +}) + +describe('Original file switches test', () => { + test.each(['keepOriginalFileDate', 'keepOriginalFileVersion'])('Should set %s to true', async (ref) => { + const wrapper = mount(WorkflowOcr, { + propsData: { + value: '{}', + }, + }) + + const switchComponent = wrapper.findComponent({ ref }) + expect(switchComponent.vm.checked).toBe(false) + + // Simulate user input + switchComponent.vm.$emit('update:checked', true) + + await 
wrapper.vm.$nextTick() + + const inputEvent = wrapper.emitted().input + expect(inputEvent).toBeTruthy() + expect(inputEvent[0][0]).toContain(`"${ref}":true`) }) }) diff --git a/tests/Integration/ViewWrapperTest.php b/tests/Integration/ViewWrapperTest.php index 5522592..9b5f714 100644 --- a/tests/Integration/ViewWrapperTest.php +++ b/tests/Integration/ViewWrapperTest.php @@ -62,6 +62,22 @@ public function testFilePutContents(string $filename, bool $expectedResult) { } } + public function testTouch() { + $path = '/mytestuser/files'; + $filename = 'testfile.txt'; + $viewWrapper = new ViewWrapper($path); + + $result = $viewWrapper->touch($filename); + $this->assertTrue($result); + + $ncView = new View($path); + $this->assertTrue($ncView->file_exists($filename)); + + $viewWrapper->touch($filename, 1234567890); + $stat = $ncView->stat($filename); + $this->assertEquals(1234567890, $stat['mtime']); + } + public function dataProvider_FilePutContents() { return [ ['testfile.txt', true], diff --git a/tests/Unit/Service/OcrServiceTest.php b/tests/Unit/Service/OcrServiceTest.php index 8b26c92..b0476c5 100644 --- a/tests/Unit/Service/OcrServiceTest.php +++ b/tests/Unit/Service/OcrServiceTest.php @@ -519,6 +519,35 @@ public function testOcrEmptyExceptionIsThrown(int $ocrMode) { $this->assertTrue($thrown); } + public function testRestoreOriginalFileModificationDate() { + $settings = new WorkflowSettings('{"keepOriginalFileDate": true}'); + $mimeType = 'application/pdf'; + $content = 'someFileContent'; + $ocrContent = 'someOcrProcessedFile'; + $ocrResult = new OcrProcessorResult($ocrContent, 'pdf', $ocrContent); // Extend this cases if we add new OCR processors + + $fileMock = $this->createValidFileMock($mimeType, $content); + $this->rootFolderGetById42ReturnValue = [$fileMock]; + + $this->ocrProcessor->expects($this->once()) + ->method('ocrFile') + ->willReturn($ocrResult); + + $viewMock = $this->createMock(IView::class); + $this->viewFactory->expects($this->once()) + 
->method('create') + ->willReturn($viewMock); + + $fileMock->expects($this->once()) + ->method('getMTime') + ->willReturn(1234); + $viewMock->expects($this->once()) + ->method('touch') + ->with('somefile.pdf', 1235); + + $this->ocrService->runOcrProcess(42, 'usr', $settings); + } + public function dataProvider_InvalidNodes() { /** @var MockObject|Node */ $folderMock = $this->createMock(Node::class); @@ -552,7 +581,7 @@ public function dataProvider_OriginalAndNewFilesnames() { /** * @return File|MockObject */ - private function createValidFileMock(string $mimeType = 'application/pdf', string $content = 'someFileContent', string $rootFolderPath = '/admin/files', string $fileName = 'somefile.pdf') { + private function createValidFileMock(string $mimeType = 'application/pdf', string $content = 'someFileContent', string $rootFolderPath = '/admin/files', string $fileName = 'somefile.pdf'): File { /** @var MockObject|File */ $fileMock = $this->createMock(File::class); $fileMock->method('getType') diff --git a/tests/psalm-baseline.xml b/tests/psalm-baseline.xml index 5375b2c..33944e2 100644 --- a/tests/psalm-baseline.xml +++ b/tests/psalm-baseline.xml @@ -56,6 +56,7 @@ + wrappedView]]> wrappedView]]>