Skip to content

Commit

Permalink
Add PCRE dotall modifier
Browse files Browse the repository at this point in the history
Add the PCRE /s (dotall) modifier to the inline image pattern so that the `.` token also matches newlines. Thanks to @iGrog for supplying another PDF that demonstrated this issue. Add the same modifier to the dictionary pattern as well, fixing that oversight.

Move the inline image replacement before the string replacement, because parentheses in binary image data may otherwise be interpreted as the start of a string.

Move the inline images test to its own function and add a newline to the sample data to test for the dotall modifier change.
  • Loading branch information
GreyWyvern committed Mar 25, 2024
1 parent 4ae52e7 commit 6f0ef9c
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 28 deletions.
50 changes: 25 additions & 25 deletions src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,20 @@ private function formatContent(?string $content): string
return '';
}

// Find all inline image content and replace them so they aren't
// affected by the next steps
$pdfInlineImages = [];
while (preg_match('/\sBI\s(.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text)) {
$id = uniqid('IMAGE_', true);
$pdfInlineImages[$id] = [$text[1], $text[2]];
$content = preg_replace(
'/'.preg_quote($text[0], '/').'/',
'^^^'.$id.'^^^',
$content,
1
);
}

// Find all strings () and replace them so they aren't affected
// by the next steps
$pdfstrings = [];
Expand Down Expand Up @@ -254,27 +268,13 @@ private function formatContent(?string $content): string
}
}

// Find all inline image content and replace them so they aren't
// affected by the next steps
$pdfInlineImages = [];
while (preg_match('/\sBI(.+?)\sID\s(.+?)\sEI(?=\s|$)/', $content, $text)) {
$id = uniqid('IMAGE_', true);
$pdfInlineImages[$id] = [$text[1], $text[2]];
$content = preg_replace(
'/'.preg_quote($text[0], '/').'/',
'^^^'.$id.'^^^',
$content,
1
);
}

// Remove all carriage returns and line-feeds from the document stream
$content = str_replace(["\r", "\n"], ' ', trim($content));

// Find all dictionary << >> commands and replace them so they
// aren't affected by the next steps
$dictstore = [];
while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) {
while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) {
$dictid = uniqid('DICT_', true);
$dictstore[$dictid] = $dicttext[1];
$content = preg_replace(
Expand Down Expand Up @@ -317,16 +317,6 @@ private function formatContent(?string $content): string
$content = str_replace('###'.$id.'###', $dict, $content);
}

// Restore the original content of any inline images
$pdfInlineImages = array_reverse($pdfInlineImages, true);
foreach ($pdfInlineImages as $id => $image) {
$content = str_replace(
'^^^'.$id.'^^^',
"\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n",
$content
);
}

// Restore the original string content
$pdfstrings = array_reverse($pdfstrings, true);
foreach ($pdfstrings as $id => $text) {
Expand All @@ -343,6 +333,16 @@ private function formatContent(?string $content): string
$content = str_replace('@@@'.$id.'@@@', $text, $content);
}

// Restore the original content of any inline images
$pdfInlineImages = array_reverse($pdfInlineImages, true);
foreach ($pdfInlineImages as $id => $image) {
$content = str_replace(
'^^^'.$id.'^^^',
"\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n",
$content
);
}

$content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

return $content;
Expand Down
16 changes: 13 additions & 3 deletions tests/PHPUnit/Integration/PDFObjectTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -284,12 +284,22 @@ public function testFormatContent(): void

// Binary check is done before a regexp that causes an error
$this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText());
}

/**
* Check that inline image data does not corrupt the stream
*
* @see: https://github.com/smalot/pdfparser/issues/691
*/
public function testFormatContentInlineImages(): void
{
$formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');
$formatContent->setAccessible(true);

// Check that inline image data does not corrupt the stream
// See: https://github.com/smalot/pdfparser/issues/691
$cleaned = $formatContent->invoke(
$this->getPdfObjectInstance(new Document()),
'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 /BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150
/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
);

// PdfParser should not be fooled by Q's in inline image data;
Expand Down

0 comments on commit 6f0ef9c

Please sign in to comment.