Skip to content

Commit

Permalink
Merge pull request #400 from smalot/fix-secured-file
Browse files Browse the repository at this point in the history
Add support for PDF 1.5 Xref stream
  • Loading branch information
smalot authored Mar 11, 2021
2 parents 83061ea + e4b062a commit 7d3c67b
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 70 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
debug*
composer.lock
/.php_cs.cache
/.phpunit.result.cache
Binary file added samples/bugs/Issue356.pdf
Binary file not shown.
153 changes: 83 additions & 70 deletions src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ protected function decodeXrefStream($pdfData, $startxref, $xref = [])
}
$valid_crs = false;
$columns = 0;
$predictor = null;
$sarr = $xrefcrs[0][1];
if (!\is_array($sarr)) {
$sarr = [];
Expand Down Expand Up @@ -310,83 +311,95 @@ protected function decodeXrefStream($pdfData, $startxref, $xref = [])

// decode data
if ($valid_crs && isset($xrefcrs[1][3][0])) {
// number of bytes in a row
$rowlen = ($columns + 1);
// convert the stream into an array of integers
$sdata = unpack('C*', $xrefcrs[1][3][0]);
// split the rows
$sdata = array_chunk($sdata, $rowlen);
// initialize decoded array
$ddata = [];
// initialize first row with zeros
$prev_row = array_fill(0, $rowlen, 0);
// for each row apply PNG unpredictor
foreach ($sdata as $k => $row) {
// initialize new row
$ddata[$k] = [];
// get PNG predictor value
$predictor = (10 + $row[0]);
// for each byte on the row
for ($i = 1; $i <= $columns; ++$i) {
// new index
$j = ($i - 1);
$row_up = $prev_row[$j];
if (1 == $i) {
$row_left = 0;
$row_upleft = 0;
} else {
$row_left = $row[($i - 1)];
$row_upleft = $prev_row[($j - 1)];
}
switch ($predictor) {
case 10: // PNG prediction (on encoding, PNG None on all rows)
$ddata[$k][$j] = $row[$i];
break;
if (null !== $predictor) {
// number of bytes in a row
$rowlen = ($columns + 1);
// convert the stream into an array of integers
$sdata = unpack('C*', $xrefcrs[1][3][0]);
// split the rows
$sdata = array_chunk($sdata, $rowlen);

// initialize decoded array
$ddata = [];
// initialize first row with zeros
$prev_row = array_fill(0, $rowlen, 0);
// for each row apply PNG unpredictor
foreach ($sdata as $k => $row) {
// initialize new row
$ddata[$k] = [];
// get PNG predictor value
$predictor = (10 + $row[0]);
// for each byte on the row
for ($i = 1; $i <= $columns; ++$i) {
// new index
$j = ($i - 1);
$row_up = $prev_row[$j];
if (1 == $i) {
$row_left = 0;
$row_upleft = 0;
} else {
$row_left = $row[($i - 1)];
$row_upleft = $prev_row[($j - 1)];
}
switch ($predictor) {
case 10: // PNG prediction (on encoding, PNG None on all rows)
$ddata[$k][$j] = $row[$i];
break;

case 11: // PNG prediction (on encoding, PNG Sub on all rows)
$ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
break;
case 11: // PNG prediction (on encoding, PNG Sub on all rows)
$ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
break;

case 12: // PNG prediction (on encoding, PNG Up on all rows)
$ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
break;
case 12: // PNG prediction (on encoding, PNG Up on all rows)
$ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
break;

case 13: // PNG prediction (on encoding, PNG Average on all rows)
$ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) / 2)) & 0xff);
break;
case 13: // PNG prediction (on encoding, PNG Average on all rows)
$ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) / 2)) & 0xff);
break;

case 14: // PNG prediction (on encoding, PNG Paeth on all rows)
// initial estimate
$p = ($row_left + $row_up - $row_upleft);
// distances
$pa = abs($p - $row_left);
$pb = abs($p - $row_up);
$pc = abs($p - $row_upleft);
$pmin = min($pa, $pb, $pc);
// return minimum distance
switch ($pmin) {
case $pa:
$ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
break;

case $pb:
$ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
break;

case $pc:
$ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff);
break;
}
break;
case 14: // PNG prediction (on encoding, PNG Paeth on all rows)
// initial estimate
$p = ($row_left + $row_up - $row_upleft);
// distances
$pa = abs($p - $row_left);
$pb = abs($p - $row_up);
$pc = abs($p - $row_upleft);
$pmin = min($pa, $pb, $pc);
// return minimum distance
switch ($pmin) {
case $pa:
$ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
break;

case $pb:
$ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
break;

case $pc:
$ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff);
break;
}
break;

default: // PNG prediction (on encoding, PNG optimum)
throw new Exception('Unknown PNG predictor');
default: // PNG prediction (on encoding, PNG optimum)
throw new Exception('Unknown PNG predictor: '.$predictor);
}
}
}
$prev_row = $ddata[$k];
} // end for each row
// complete decoding
$prev_row = $ddata[$k];
} // end for each row
// complete decoding
} else {
// number of bytes in a row
$rowlen = array_sum($wb);
// convert the stream into an array of integers
$sdata = unpack('C*', $xrefcrs[1][3][0]);
// split the rows
$ddata = array_chunk($sdata, $rowlen);
}

$sdata = [];

// for every row
foreach ($ddata as $k => $row) {
// initialize new row
Expand Down
23 changes: 23 additions & 0 deletions tests/Integration/RawData/RawDataParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,27 @@ public function testGetRawObjectIssue372()
$result
);
}

/**
 * Regression test for decodeXrefStream using the sample PDF from issue 356.
 *
 * Parses samples/bugs/Issue356.pdf and checks that the text of the first
 * page contains the expected Greek phrase.
 *
 * @see https://github.com/smalot/pdfparser/issues/30
 * @see https://github.com/smalot/pdfparser/issues/192
 * @see https://github.com/smalot/pdfparser/issues/209
 * @see https://github.com/smalot/pdfparser/issues/330
 * @see https://github.com/smalot/pdfparser/issues/356
 * @see https://github.com/smalot/pdfparser/issues/373
 * @see https://github.com/smalot/pdfparser/issues/392
 * @see https://github.com/smalot/pdfparser/issues/397
 */
public function testDecodeXrefStreamIssue356()
{
    // sample file attached to the bug report
    $pdfPath = $this->rootDir.'/samples/bugs/Issue356.pdf';

    // parse the document and pick its first page
    $firstPage = $this->getParserInstance()->parseFile($pdfPath)->getPages()[0];

    $this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $firstPage->getText());
}
}

0 comments on commit 7d3c67b

Please sign in to comment.