Skip to content

Commit

Permalink
Add option to keep original file date
Browse files Browse the repository at this point in the history
Implements #256
  • Loading branch information
R0Wi committed Dec 14, 2024
1 parent a625f45 commit 7994335
Show file tree
Hide file tree
Showing 9 changed files with 117 additions and 10 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ Assign tags after OCR | These tags will be assigned to the file after it has bee
Remove tags after OCR | These tags will be removed from the file after it has been successfully processed. If the file does not have the tag, it will just be skipped. |
OCR mode | Controls the way files are processed, which already have OCR content. For PDF files this setting corresponds to the `--skip-text`, `--redo-ocr` and `--force-ocr` parameters of `ocrmypdf`. See [official docs](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) for additional information.<br>**Skip text:** skip pages completely that already contain text. Such a page will not be touched and just be copied to the final output.<br>**Redo OCR:** perform a detailed text analysis to split up pages into areas with and without text.<br>**Force OCR:** all pages will be rasterized to images and OCR will be performed on every page. |
Keep original file version | If the switch is set, the original file (before applying OCR) will be kept. This is done by giving the file version the label `Before OCR`. This version will be excluded from the automatic expiration process (see [here](https://docs.nextcloud.com/server/latest/user_manual/en/files/version_control.html#naming-a-version) for details). |
Keep original file modification date | Restore the modification date of the original file. The original modification date will be applied to the newly created file version. This is useful if you need to preserve the file modification date, for example to be able to sort files accordingly. |
Remove background\* | If the switch is set, the OCR processor will try to remove the background of the document before processing and instead set a white background. For PDF files this setting corresponds to the [`--remove-background`](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html?highlight=remove-background#image-processing) parameter of `ocrmypdf`.<br/>:warning: Please note that this flag will currently only work with **`ocrmypdf` versions prior to 13**. It might be added in future versions again. See [here](https://github.com/ocrmypdf/OCRmyPDF/issues/884) for details. :warning:|
Custom ocrMyPdf CLI arguments | If you want to pass custom arguments to the `ocrmypdf` CLI, you can do so here. Please note that the arguments will be passed as they are to the CLI, so make sure to use the correct syntax. Check the [official docs](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html) for more information. |

Expand Down
11 changes: 11 additions & 0 deletions lib/Model/WorkflowSettings.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class WorkflowSettings {
/** @var bool */
private $keepOriginalFileVersion = false;

/** @var bool */
private $keepOriginalFileDate = false;

/** @var string */
private $customCliArgs = '';

Expand Down Expand Up @@ -104,6 +107,13 @@ public function getKeepOriginalFileVersion(): bool {
return $this->keepOriginalFileVersion;
}

/**
* Returns the current value of the "keep original file date" setting.
*
* The backing property is initialised to false.
*
* @return bool
*/
public function getKeepOriginalFileDate(): bool {
return $this->keepOriginalFileDate;
}

/**
* @return string
*/
Expand Down Expand Up @@ -143,6 +153,7 @@ private function setJson(?string $json = null) {
$this->setProperty($this->tagsToRemoveAfterOcr, $data, 'tagsToRemoveAfterOcr', fn ($value) => is_array($value));
$this->setProperty($this->tagsToAddAfterOcr, $data, 'tagsToAddAfterOcr', fn ($value) => is_array($value));
$this->setProperty($this->keepOriginalFileVersion, $data, 'keepOriginalFileVersion', fn ($value) => is_bool($value));
$this->setProperty($this->keepOriginalFileDate, $data, 'keepOriginalFileDate', fn ($value) => is_bool($value));
$this->setProperty($this->customCliArgs, $data, 'customCliArgs', fn ($value) => is_string($value));
}

Expand Down
19 changes: 16 additions & 3 deletions lib/Service/OcrService.php
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ public function runOcrProcess(int $fileId, string $uid, WorkflowSettings $settin
$this->initUserEnvironment($uid);

$file = $this->getNode($fileId);

$fileMtime = null;
if ($settings->getKeepOriginalFileDate()) {
// Add one second to the original file modification time (getMTime() is a Unix timestamp in seconds) to prevent the new original version from being overwritten
$fileMtime = $file->getMTime() + 1;
}

$ocrProcessor = $this->ocrProcessorFactory->create($file->getMimeType());
$globalSettings = $this->globalSettingsService->getGlobalSettings();

Expand Down Expand Up @@ -153,7 +160,7 @@ public function runOcrProcess(int $fileId, string $uid, WorkflowSettings $settin
$filePath :
$filePath . '.pdf';

$this->createNewFileVersion($newFilePath, $fileContent, $fileId);
$this->createNewFileVersion($newFilePath, $fileContent, $fileId, $fileMtime);
}

$this->eventService->textRecognized($result, $file);
Expand All @@ -180,7 +187,7 @@ private function shutdownUserEnvironment() : void {
$this->userSession->setUser(null);
}

private function getNode(int $fileId) : ?Node {
private function getNode(int $fileId) : Node {
/** @var File[] */
$nodeArr = $this->rootFolder->getById($fileId);
if (count($nodeArr) === 0) {
Expand Down Expand Up @@ -223,8 +230,9 @@ private function processTagsAfterSuccessfulOcr(File $file, WorkflowSettings $set
* @param string $filePath The filepath of the file to write
* @param string $ocrContent The new filecontent (which was OCR processed)
* @param int $fileId The id of the file to write. Used for locking.
* @param int|null $fileMtime The mtime to apply to the new file, or null to leave the mtime untouched. Can be used to restore the original modification time of the non-OCR file.
*/
private function createNewFileVersion(string $filePath, string $ocrContent, int $fileId) : void {
private function createNewFileVersion(string $filePath, string $ocrContent, int $fileId, ?int $fileMtime = null) : void {
$dirPath = dirname($filePath);
$filename = basename($filePath);

Expand All @@ -237,6 +245,11 @@ private function createNewFileVersion(string $filePath, string $ocrContent, int
// add the file to the queue again but this is tackled
// by the processingFileAccessor.
$view->file_put_contents($filename, $ocrContent);

// Restore the original modification time of the non-OCR file
if ($fileMtime !== null) {
$view->touch($filename, $fileMtime);
}
} finally {
$this->processingFileAccessor->setCurrentlyProcessedFileId(null);
}
Expand Down
1 change: 1 addition & 0 deletions lib/Wrapper/IView.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@
*/
interface IView {
	/**
	 * Writes the given content to the file at the given path.
	 *
	 * @param string $filePath Path of the file to write
	 * @param string $content The content to write
	 * @return bool Result reported by the implementation
	 */
	public function file_put_contents(string $filePath, string $content) : bool;

	/**
	 * Touches the file at the given path, optionally applying an explicit
	 * modification time.
	 *
	 * Parameters are typed to match the typed style of file_put_contents();
	 * implementations that declare untyped parameters remain compatible.
	 *
	 * @param string $path Path of the file to touch
	 * @param int|null $mtime Modification time to apply, or null to leave the choice to the implementation
	 * @return bool Result reported by the implementation
	 */
	public function touch(string $path, ?int $mtime = null): bool;
}
7 changes: 7 additions & 0 deletions lib/Wrapper/ViewWrapper.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,11 @@ public function file_put_contents(string $filePath, string $content) : bool {
$retVal = $this->wrappedView->file_put_contents($filePath, $content);
return boolval($retVal);
}

/**
* Delegates to the wrapped view's touch() and returns its result.
*
* Both arguments are passed through unchanged.
*
* @param mixed $path Path of the file to touch
* @param mixed $mtime Modification time to apply; defaults to null
* @return bool Result of the wrapped view's touch() call
*/
public function touch($path, $mtime = null): bool {
return $this->wrappedView->touch($path, $mtime);

Check failure on line 51 in lib/Wrapper/ViewWrapper.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-master PHP8.1

UndefinedDocblockClass

lib/Wrapper/ViewWrapper.php:51:10: UndefinedDocblockClass: Docblock-defined class, interface or enum named OC\Files\View does not exist (see https://psalm.dev/200)

Check failure on line 51 in lib/Wrapper/ViewWrapper.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-master PHP8.3

UndefinedDocblockClass

lib/Wrapper/ViewWrapper.php:51:10: UndefinedDocblockClass: Docblock-defined class, interface or enum named OC\Files\View does not exist (see https://psalm.dev/200)
}
}
9 changes: 8 additions & 1 deletion src/components/WorkflowOcr.vue
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,16 @@
type="switch">
{{ t('workflow_ocr', 'Remove background') }}
</NcCheckboxRadioSwitch>
<NcCheckboxRadioSwitch ref="keepOriginalFileVersionSwitch"
<NcCheckboxRadioSwitch ref="keepOriginalFileVersion"
:checked.sync="model.keepOriginalFileVersion"
type="switch">
{{ t('workflow_ocr', 'Keep original file version') }}
</NcCheckboxRadioSwitch>
<NcCheckboxRadioSwitch ref="keepOriginalFileDate"
:checked.sync="model.keepOriginalFileDate"
type="switch">
{{ t('workflow_ocr', 'Keep original file modification date') }}
</NcCheckboxRadioSwitch>
</div>
</SettingsItem>
<div>
Expand Down Expand Up @@ -140,6 +145,7 @@ export default {
* tagsToRemoveAfterOcr: [42, 43],
* removeBackground: true,
* keepOriginalFileVersion: true,
* keepOriginalFileDate: true,
* ocrMode: 0,
* customCliArgs: '--rotate-pages-threshold 8',
* }
Expand All @@ -151,6 +157,7 @@ export default {
tagsToRemoveAfterOcr: [],
removeBackground: false,
keepOriginalFileVersion: false,
keepOriginalFileDate: false,
ocrMode: 0,
customCliArgs: '',
},
Expand Down
32 changes: 27 additions & 5 deletions src/test/components/WorkflowOcr.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ describe('Language settings tests', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')
expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}')

})
})
Expand Down Expand Up @@ -182,7 +182,7 @@ describe('Add/remove tags tests', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[1,2],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[1,2],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}')
})

test('User input for removeTagsAfterOcr is applied correctly on empty component', async () => {
Expand All @@ -202,7 +202,7 @@ describe('Add/remove tags tests', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[1,2],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[1,2],"removeBackground":true,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}')
})
})

Expand Down Expand Up @@ -239,7 +239,7 @@ describe('Remove background tests', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":""}')
})
})

Expand Down Expand Up @@ -340,6 +340,28 @@ describe('Custom CLI args test', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":[],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":"--dpi 300"}')
expect(inputEvent[0][0]).toBe('{"languages":[],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"keepOriginalFileDate":false,"ocrMode":0,"customCliArgs":"--dpi 300"}')
})
})

describe('Original file switches test', () => {
	// Each switch is looked up by its template ref, which is also the name of
	// the model property it is bound to.
	test.each(['keepOriginalFileDate', 'keepOriginalFileVersion'])('Should set %s to true', async (ref) => {
		// Start from an empty settings object; the switch is expected to be off
		const propsData = { value: '{}' }
		const wrapper = mount(WorkflowOcr, { propsData })

		const toggle = wrapper.findComponent({ ref })
		expect(toggle.vm.checked).toBe(false)

		// Simulate the user flipping the switch and wait for the re-render
		toggle.vm.$emit('update:checked', true)
		await wrapper.vm.$nextTick()

		// The first emitted input event must carry the serialized model with the flag enabled
		const emittedInput = wrapper.emitted().input
		expect(emittedInput).toBeTruthy()
		const [[serializedModel]] = emittedInput
		expect(serializedModel).toContain(`"${ref}":true`)
	})
})
16 changes: 16 additions & 0 deletions tests/Integration/ViewWrapperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,22 @@ public function testFilePutContents(string $filename, bool $expectedResult) {
}
}

public function testTouch() {
	$userFilesPath = '/mytestuser/files';
	$testFile = 'testfile.txt';
	$expectedMtime = 1234567890;

	$wrapper = new ViewWrapper($userFilesPath);
	$nativeView = new View($userFilesPath);

	// Touching without an explicit mtime must succeed and the file must exist afterwards
	$this->assertTrue($wrapper->touch($testFile));
	$this->assertTrue($nativeView->file_exists($testFile));

	// Touching with an explicit mtime must be reflected in the file's stat data
	$wrapper->touch($testFile, $expectedMtime);
	$fileStat = $nativeView->stat($testFile);
	$this->assertEquals($expectedMtime, $fileStat['mtime']);
}

public function dataProvider_FilePutContents() {
return [
['testfile.txt', true],
Expand Down
31 changes: 30 additions & 1 deletion tests/Unit/Service/OcrServiceTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,35 @@ public function testOcrEmptyExceptionIsThrown(int $ocrMode) {
$this->assertTrue($thrown);
}

public function testRestoreOriginalFileModificationDate() {
	$originalMtime = 1234;
	$processedContent = 'someOcrProcessedFile';

	// The file handed to the service reports a fixed modification time exactly once
	$pdfMock = $this->createValidFileMock('application/pdf', 'someFileContent');
	$pdfMock->expects($this->once())
		->method('getMTime')
		->willReturn($originalMtime);
	$this->rootFolderGetById42ReturnValue = [$pdfMock];

	// Extend this cases if we add new OCR processors
	$processorResult = new OcrProcessorResult($processedContent, 'pdf', $processedContent);
	$this->ocrProcessor->expects($this->once())
		->method('ocrFile')
		->willReturn($processorResult);

	// The view must be touched with the original mtime incremented by one
	$view = $this->createMock(IView::class);
	$view->expects($this->once())
		->method('touch')
		->with('somefile.pdf', $originalMtime + 1);
	$this->viewFactory->expects($this->once())
		->method('create')
		->willReturn($view);

	$workflowSettings = new WorkflowSettings('{"keepOriginalFileDate": true}');
	$this->ocrService->runOcrProcess(42, 'usr', $workflowSettings);
}

public function dataProvider_InvalidNodes() {
/** @var MockObject|Node */
$folderMock = $this->createMock(Node::class);
Expand Down Expand Up @@ -552,7 +581,7 @@ public function dataProvider_OriginalAndNewFilesnames() {
/**
* @return File|MockObject
*/
private function createValidFileMock(string $mimeType = 'application/pdf', string $content = 'someFileContent', string $rootFolderPath = '/admin/files', string $fileName = 'somefile.pdf') {
private function createValidFileMock(string $mimeType = 'application/pdf', string $content = 'someFileContent', string $rootFolderPath = '/admin/files', string $fileName = 'somefile.pdf'): File {
/** @var MockObject|File */
$fileMock = $this->createMock(File::class);
$fileMock->method('getType')
Expand Down

0 comments on commit 7994335

Please sign in to comment.