From 57667ee6b255ade43c3149d505ad21e9eebe7cf8 Mon Sep 17 00:00:00 2001 From: Luciano Hanna El Adji Date: Mon, 26 Apr 2021 07:59:41 -0300 Subject: [PATCH] Refactor: declare pdf whitespaces in global variable to avoid bug (#411) * Refactor: declare pdf whitespaces in global variable to reduce chance of bug using regular whitespace * Refactor: move $pdfWhitespaces and $pdfWhitespacesRegex to Config.php * Bug fix in php 5.6 * Add explicit return types and phpdoc @var * Merge #409 --- src/Smalot/PdfParser/Config.php | 32 +++++++++++ src/Smalot/PdfParser/Parser.php | 2 +- .../PdfParser/RawData/RawDataParser.php | 53 +++++++++++-------- .../Integration/RawData/RawDataParserTest.php | 3 +- 4 files changed, 67 insertions(+), 23 deletions(-) diff --git a/src/Smalot/PdfParser/Config.php b/src/Smalot/PdfParser/Config.php index 2f339b99..1e13081c 100644 --- a/src/Smalot/PdfParser/Config.php +++ b/src/Smalot/PdfParser/Config.php @@ -40,6 +40,18 @@ class Config { private $fontSpaceLimit = -50; + /** + * Represents: (NUL, HT, LF, FF, CR, SP) + * @var string + */ + private $pdfWhitespaces = "\0\t\n\f\r "; + + /** + * Represents: (NUL, HT, LF, FF, CR, SP) + * @var string + */ + private $pdfWhitespacesRegex = '[\0\t\n\f\r ]'; + public function getFontSpaceLimit() { return $this->fontSpaceLimit; @@ -49,4 +61,24 @@ public function setFontSpaceLimit($value) { $this->fontSpaceLimit = $value; } + + public function getPdfWhitespaces(): string + { + return $this->pdfWhitespaces; + } + + public function setPdfWhitespaces(string $pdfWhitespaces): void + { + $this->pdfWhitespaces = $pdfWhitespaces; + } + + public function getPdfWhitespacesRegex(): string + { + return $this->pdfWhitespacesRegex; + } + + public function setPdfWhitespacesRegex(string $pdfWhitespacesRegex): void + { + $this->pdfWhitespacesRegex = $pdfWhitespacesRegex; + } } diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index 6176c38c..0f50b09d 100644 --- a/src/Smalot/PdfParser/Parser.php +++ 
b/src/Smalot/PdfParser/Parser.php @@ -60,8 +60,8 @@ class Parser public function __construct($cfg = [], Config $config = null) { - $this->rawDataParser = new RawDataParser($cfg); $this->config = $config ?: new Config(); + $this->rawDataParser = new RawDataParser($cfg, $this->config); } /** diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 3b1309e7..563d718e 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -41,9 +41,15 @@ namespace Smalot\PdfParser\RawData; use Exception; +use Smalot\PdfParser\Config; class RawDataParser { + /** + * @var \Smalot\PdfParser\Config + */ + private $config; + /** * Configuration array. */ @@ -60,12 +66,13 @@ class RawDataParser /** * @param array $cfg Configuration array, default is [] */ - public function __construct($cfg = []) + public function __construct($cfg = [], Config $config = null) { // merge given array with default values $this->cfg = array_merge($this->cfg, $cfg); $this->filterHelper = new FilterHelper(); + $this->config = $config ?: new Config(); } /** @@ -148,8 +155,8 @@ protected function decodeStream($pdfData, $xref, $sdic, $stream) protected function decodeXref($pdfData, $startxref, $xref = []) { $startxref += 4; // 4 is the length of the word 'xref' - // skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP) - $offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref); + // skip initial white space chars + $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref); // initialize object number $obj_num = 0; // search for cross-reference entries or subsection @@ -463,6 +470,19 @@ protected function decodeXrefStream($pdfData, $startxref, $xref = []) return $xref; } + protected function getObjectHeaderPattern($objRefArr): string + { + // 
consider all whitespace character (PDF specifications) + return '/'.$objRefArr[0].$this->config->getPdfWhitespacesRegex().$objRefArr[1].$this->config->getPdfWhitespacesRegex().'obj'.'/'; + } + + protected function getObjectHeaderLen($objRefArr): int + { + // "4 0 obj" + // 2 whitespaces + strlen("obj") = 5 + return 5 + \strlen($objRefArr[0]) + \strlen($objRefArr[1]); + } + /** * Get content of indirect object. * @@ -486,18 +506,17 @@ protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $dec if (2 !== \count($objRefArr)) { throw new Exception('Invalid object reference for $obj.'); } - $objHeader = $objRefArr[0].' '.$objRefArr[1].' obj'; + + $objHeaderLen = $this->getObjectHeaderLen($objRefArr); /* * check if we are in position */ - // ignore whitespace characters at offset (NUL, HT, LF, FF, CR, SP) - $offset += strspn($pdfData, "\0\t\n\f\r ", $offset); + // ignore whitespace characters at offset + $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); // ignore leading zeros for object number $offset += strspn($pdfData, '0', $offset); - // consider all whitespace character (PDF specifications) - $objHeaderPattern = '/'.$objRefArr[0].'[\0\t\n\f\r ]'.$objRefArr[1].'[\0\t\n\f\r ]obj'.'/'; - if (0 == preg_match($objHeaderPattern, substr($pdfData, $offset, \strlen($objHeader)))) { + if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) { // an indirect reference to an undefined object shall be considered a reference to the null object return ['null', 'null', $offset]; } @@ -506,7 +525,7 @@ protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $dec * get content */ // starting position of object content - $offset += \strlen($objHeader); + $offset += $objHeaderLen; $objContentArr = []; $i = 0; // object main index do { @@ -570,16 +589,8 @@ protected function getRawObject($pdfData, $offset = 0) $objtype = ''; // object type to be returned $objval = ''; // 
object value to be returned - /* - * skip initial white space chars: - * \x00 null (NUL) - * \x09 horizontal tab (HT) - * \x0A line feed (LF) - * \x0C form feed (FF) - * \x0D carriage return (CR) - * \x20 space (SP) - */ - $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset); + // skip initial white space chars + $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); // get first char $char = $pdfData[$offset]; @@ -694,7 +705,7 @@ protected function getRawObject($pdfData, $offset = 0) ); if (('<' == $char) && 1 == $pregResult) { // remove white space characters - $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", ''); + $objval = str_replace(str_split($this->config->getPdfWhitespaces()), '', $matches[1]); $offset += \strlen($matches[0]); } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) { $offset = $endpos + 1; diff --git a/tests/Integration/RawData/RawDataParserTest.php b/tests/Integration/RawData/RawDataParserTest.php index 50b35703..7f15870f 100644 --- a/tests/Integration/RawData/RawDataParserTest.php +++ b/tests/Integration/RawData/RawDataParserTest.php @@ -32,6 +32,7 @@ namespace Tests\Smalot\PdfParser\Integration\RawData; +use Smalot\PdfParser\Config; use Smalot\PdfParser\RawData\RawDataParser; use Tests\Smalot\PdfParser\TestCase; @@ -52,7 +53,7 @@ protected function setUp(): void { parent::setUp(); - $this->fixture = new RawDataParserHelper(); + $this->fixture = new RawDataParserHelper([], new Config()); } /**