diff --git a/doc/CustomConfig.md b/doc/CustomConfig.md index 34d5c1cf..377e6102 100644 --- a/doc/CustomConfig.md +++ b/doc/CustomConfig.md @@ -21,6 +21,7 @@ The `Config` class has the following options: |--------------------------|---------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------| | `setDecodeMemoryLimit` | Integer | `0` | If parsing fails because of memory exhaustion, you can set a lower memory limit for decoding operations. | | `setFontSpaceLimit` | Integer | `-50` | Changing font space limit can be helpful when `Parser::getText()` returns a text with too many spaces. | +| `setIgnoreEncryption` | Boolean | `false` | Read PDFs that are not encrypted but have the encryption flag set. This is a temporary workaround, don't rely on it. | | `setHorizontalOffset` | String | ` ` | When words are broken up or when the structure of a table is not preserved, you may get better results when adapting `setHorizontalOffset`. | | `setPdfWhitespaces` | String | `\0\t\n\f\r ` | | | `setPdfWhitespacesRegex` | String | `[\0\t\n\f\r ]` | | @@ -63,3 +64,17 @@ $config->setFontSpaceLimit(-60); $parser = new \Smalot\PdfParser\Parser([], $config); $pdf = $parser->parseFile('document.pdf'); ``` + +## option setIgnoreEncryption + +In some cases PDF files may be internally marked as encrypted even though the content is not encrypted and can be read. +This can be caused by the PDF being created by a tool that does not properly set the encryption flag. +If you are sure that the PDF is not encrypted, you can ignore the encryption flag by setting the `ignoreEncryption` flag to `true` in a custom `Config` instance. + +```php +$config = new \Smalot\PdfParser\Config(); +$config->setIgnoreEncryption(true); + +$parser = new \Smalot\PdfParser\Parser([], $config); +$pdf = $parser->parseFile('document.pdf'); +``` diff --git a/doc/Usage.md b/doc/Usage.md index 864c2924..398c243c 100644 --- a/doc/Usage.md +++ b/doc/Usage.md @@ -230,3 +230,14 @@ foreach ($pages as $page) { ]; } ``` + +## PDF encryption + +This library cannot currently read encrypted PDF files, i.e. those with +a read password. Attempting to do so produces this error: +``` +Exception: Secured pdf file are currently not supported. +``` + +See `setIgnoreEncryption` option in [CustomConfig.md](CustomConfig.md) +for how to override the check in specific cases. diff --git a/samples/not_really_encrypted.pdf b/samples/not_really_encrypted.pdf new file mode 100644 index 00000000..fe841fe8 Binary files /dev/null and b/samples/not_really_encrypted.pdf differ diff --git a/src/Smalot/PdfParser/Config.php b/src/Smalot/PdfParser/Config.php index ff69d3e6..e44b1640 100644 --- a/src/Smalot/PdfParser/Config.php +++ b/src/Smalot/PdfParser/Config.php @@ -82,6 +82,13 @@ class Config */ private $dataTmFontInfoHasToBeIncluded = false; + /** + * Whether to attempt to read PDFs even if they are marked as encrypted. + * + * @var bool + */ + private $ignoreEncryption = false; + public function getFontSpaceLimit() { return $this->fontSpaceLimit; @@ -151,4 +158,18 @@ public function setDataTmFontInfoHasToBeIncluded(bool $dataTmFontInfoHasToBeIncl { $this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded; } + + public function getIgnoreEncryption(): bool + { + return $this->ignoreEncryption; + } + + /** + * @deprecated this is a temporary workaround, don't rely on it + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function setIgnoreEncryption(bool $ignoreEncryption): void + { + $this->ignoreEncryption = $ignoreEncryption; + } } diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index d3cac625..86bfe555 100644 --- a/src/Smalot/PdfParser/Parser.php +++ b/src/Smalot/PdfParser/Parser.php @@ -102,7 +102,7 @@ public function parseContent(string $content): Document // Create structure from raw data. list($xref, $data) = $this->rawDataParser->parseData($content); - if (isset($xref['trailer']['encrypt'])) { + if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) { throw new \Exception('Secured pdf file are currently not supported.'); } diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index 29091914..fa0d3f42 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -403,6 +403,41 @@ public function testRetainImageContentImpact(): void $this->assertLessThan($baselineMemory * 1.05, $usedMemory, 'Memory is '.$usedMemory); $this->assertTrue('' !== $document->getText()); } + + /** + * Tests handling of encrypted PDF. + * + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function testNoIgnoreEncryption(): void + { + $filename = $this->rootDir.'/samples/not_really_encrypted.pdf'; + $threw = false; + try { + (new Parser([]))->parseFile($filename); + } catch (\Exception $e) { + // we expect an exception to be thrown if an encrypted PDF is encountered. + $threw = true; + } + $this->assertTrue($threw); + } + + /** + * Tests behavior if encryption is ignored. + * + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function testIgnoreEncryption(): void + { + $config = new Config(); + $config->setIgnoreEncryption(true); + + $filename = $this->rootDir.'/samples/not_really_encrypted.pdf'; + + $this->assertTrue((new Parser([], $config))->parseFile($filename) instanceof Document); + + // without the configuration option set, an exception would be thrown. + } } class ParserSub extends Parser