diff --git a/.gitignore b/.gitignore index c685f194..56badf4b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ debug* composer.lock /.php_cs.cache +/.phpunit.result.cache diff --git a/samples/bugs/Issue356.pdf b/samples/bugs/Issue356.pdf new file mode 100644 index 00000000..7015a591 Binary files /dev/null and b/samples/bugs/Issue356.pdf differ diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index b822ee4e..e6952b7d 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -239,6 +239,7 @@ protected function decodeXrefStream($pdfData, $startxref, $xref = []) } $valid_crs = false; $columns = 0; + $predictor = null; $sarr = $xrefcrs[0][1]; if (!\is_array($sarr)) { $sarr = []; @@ -310,83 +311,95 @@ protected function decodeXrefStream($pdfData, $startxref, $xref = []) // decode data if ($valid_crs && isset($xrefcrs[1][3][0])) { - // number of bytes in a row - $rowlen = ($columns + 1); - // convert the stream into an array of integers - $sdata = unpack('C*', $xrefcrs[1][3][0]); - // split the rows - $sdata = array_chunk($sdata, $rowlen); - // initialize decoded array - $ddata = []; - // initialize first row with zeros - $prev_row = array_fill(0, $rowlen, 0); - // for each row apply PNG unpredictor - foreach ($sdata as $k => $row) { - // initialize new row - $ddata[$k] = []; - // get PNG predictor value - $predictor = (10 + $row[0]); - // for each byte on the row - for ($i = 1; $i <= $columns; ++$i) { - // new index - $j = ($i - 1); - $row_up = $prev_row[$j]; - if (1 == $i) { - $row_left = 0; - $row_upleft = 0; - } else { - $row_left = $row[($i - 1)]; - $row_upleft = $prev_row[($j - 1)]; - } - switch ($predictor) { - case 10: // PNG prediction (on encoding, PNG None on all rows) - $ddata[$k][$j] = $row[$i]; - break; + if (null !== $predictor) { + // number of bytes in a row + $rowlen = ($columns + 1); + // convert the stream into an array of integers + $sdata = unpack('C*', $xrefcrs[1][3][0]); + // split the rows + $sdata = array_chunk($sdata, $rowlen); + + // initialize decoded array + $ddata = []; + // initialize first row with zeros + $prev_row = array_fill(0, $rowlen, 0); + // for each row apply PNG unpredictor + foreach ($sdata as $k => $row) { + // initialize new row + $ddata[$k] = []; + // get PNG predictor value + $predictor = (10 + $row[0]); + // for each byte on the row + for ($i = 1; $i <= $columns; ++$i) { + // new index + $j = ($i - 1); + $row_up = $prev_row[$j]; + if (1 == $i) { + $row_left = 0; + $row_upleft = 0; + } else { + $row_left = $row[($i - 1)]; + $row_upleft = $prev_row[($j - 1)]; + } + switch ($predictor) { + case 10: // PNG prediction (on encoding, PNG None on all rows) + $ddata[$k][$j] = $row[$i]; + break; - case 11: // PNG prediction (on encoding, PNG Sub on all rows) - $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff); - break; + case 11: // PNG prediction (on encoding, PNG Sub on all rows) + $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff); + break; - case 12: // PNG prediction (on encoding, PNG Up on all rows) - $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff); - break; + case 12: // PNG prediction (on encoding, PNG Up on all rows) + $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff); + break; - case 13: // PNG prediction (on encoding, PNG Average on all rows) - $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) / 2)) & 0xff); - break; + case 13: // PNG prediction (on encoding, PNG Average on all rows) + $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) / 2)) & 0xff); + break; - case 14: // PNG prediction (on encoding, PNG Paeth on all rows) - // initial estimate - $p = ($row_left + $row_up - $row_upleft); - // distances - $pa = abs($p - $row_left); - $pb = abs($p - $row_up); - $pc = abs($p - $row_upleft); - $pmin = min($pa, $pb, $pc); - // return minimum distance - switch ($pmin) { - case $pa: - $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff); - break; - - case $pb: - $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff); - break; - - case $pc: - $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff); - break; - } - break; + case 14: // PNG prediction (on encoding, PNG Paeth on all rows) + // initial estimate + $p = ($row_left + $row_up - $row_upleft); + // distances + $pa = abs($p - $row_left); + $pb = abs($p - $row_up); + $pc = abs($p - $row_upleft); + $pmin = min($pa, $pb, $pc); + // return minimum distance + switch ($pmin) { + case $pa: + $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff); + break; + + case $pb: + $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff); + break; + + case $pc: + $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff); + break; + } + break; - default: // PNG prediction (on encoding, PNG optimum) - throw new Exception('Unknown PNG predictor'); + default: // PNG prediction (on encoding, PNG optimum) + throw new Exception('Unknown PNG predictor: '.$predictor); + } } - } - $prev_row = $ddata[$k]; - } // end for each row - // complete decoding + $prev_row = $ddata[$k]; + } // end for each row + // complete decoding + } else { + // number of bytes in a row + $rowlen = array_sum($wb); + // convert the stream into an array of integers + $sdata = unpack('C*', $xrefcrs[1][3][0]); + // split the rows + $ddata = array_chunk($sdata, $rowlen); + } + $sdata = []; + // for every row foreach ($ddata as $k => $row) { // initialize new row diff --git a/tests/Integration/RawData/RawDataParserTest.php b/tests/Integration/RawData/RawDataParserTest.php index de2c5af0..4847eb24 100644 --- a/tests/Integration/RawData/RawDataParserTest.php +++ b/tests/Integration/RawData/RawDataParserTest.php @@ -84,4 +84,27 @@ public function testGetRawObjectIssue372() $result ); } + + /** + * Tests buggy behavior of decodeXrefStream. + * + * @see https://github.com/smalot/pdfparser/issues/30 + * @see https://github.com/smalot/pdfparser/issues/192 + * @see https://github.com/smalot/pdfparser/issues/209 + * @see https://github.com/smalot/pdfparser/issues/330 + * @see https://github.com/smalot/pdfparser/issues/356 + * @see https://github.com/smalot/pdfparser/issues/373 + * @see https://github.com/smalot/pdfparser/issues/392 + * @see https://github.com/smalot/pdfparser/issues/397 + */ + public function testDecodeXrefStreamIssue356() + { + $filename = $this->rootDir.'/samples/bugs/Issue356.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText()); + } }