From 6f0ef9c309b3a9789d5fdf498a4d4b83e50bcd54 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Mon, 25 Mar 2024 11:40:13 -0400 Subject: [PATCH] Add PCRE dotall modifier Add the /s modifier so the `.` token matches newlines as well. Thanks to @iGrog for supplying another PDF that demonstrated this issue. Add the same modifier for dictionaries as well, fixing this oversight. Move the inline image replacement before string replacement. Parentheses in binary image data may be interpreted as the start of a string. Move the inline images test to its own function and add a newline to the sample data to test for the dotall modifier change. --- src/Smalot/PdfParser/PDFObject.php | 50 ++++++++++----------- tests/PHPUnit/Integration/PDFObjectTest.php | 16 +++++-- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index b9ac62d1..20e92334 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -224,6 +224,20 @@ private function formatContent(?string $content): string return ''; } + // Find all inline image content and replace them so they aren't + // affected by the next steps + $pdfInlineImages = []; + while (preg_match('/\sBI\s(.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text)) { + $id = uniqid('IMAGE_', true); + $pdfInlineImages[$id] = [$text[1], $text[2]]; + $content = preg_replace( + '/'.preg_quote($text[0], '/').'/', + '^^^'.$id.'^^^', + $content, + 1 + ); + } + // Find all strings () and replace them so they aren't affected // by the next steps $pdfstrings = []; @@ -254,27 +268,13 @@ private function formatContent(?string $content): string } } - // Find all inline image content and replace them so they aren't - // affected by the next steps - $pdfInlineImages = []; - while (preg_match('/\sBI(.+?)\sID\s(.+?)\sEI(?=\s|$)/', $content, $text)) { - $id = uniqid('IMAGE_', true); - $pdfInlineImages[$id] = [$text[1], $text[2]]; - $content = preg_replace( - '/'.preg_quote($text[0], '/').'/', - '^^^'.$id.'^^^', - $content, - 1 - ); - } - // Remove all carriage returns and line-feeds from the document stream $content = str_replace(["\r", "\n"], ' ', trim($content)); // Find all dictionary << >> commands and replace them so they // aren't affected by the next steps $dictstore = []; - while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) { + while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) { $dictid = uniqid('DICT_', true); $dictstore[$dictid] = $dicttext[1]; $content = preg_replace( @@ -317,16 +317,6 @@ private function formatContent(?string $content): string $content = str_replace('###'.$id.'###', $dict, $content); } - // Restore the original content of any inline images - $pdfInlineImages = array_reverse($pdfInlineImages, true); - foreach ($pdfInlineImages as $id => $image) { - $content = str_replace( - '^^^'.$id.'^^^', - "\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n", - $content - ); - } - // Restore the original string content $pdfstrings = array_reverse($pdfstrings, true); foreach ($pdfstrings as $id => $text) { @@ -343,6 +333,16 @@ private function formatContent(?string $content): string $content = str_replace('@@@'.$id.'@@@', $text, $content); } + // Restore the original content of any inline images + $pdfInlineImages = array_reverse($pdfInlineImages, true); + foreach ($pdfInlineImages as $id => $image) { + $content = str_replace( + '^^^'.$id.'^^^', + "\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n", + $content + ); + } + $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content)); return $content; diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php index 39079a2c..a2ee699f 100644 --- a/tests/PHPUnit/Integration/PDFObjectTest.php +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -284,12 +284,22 @@ public function testFormatContent(): void // Binary check is done before a regexp that causes an error $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText()); + } + + /** + * Check that inline image data does not corrupt the stream + * + * @see: https://github.com/smalot/pdfparser/issues/691 + */ + public function testFormatContentInlineImages(): void + { + $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent'); + $formatContent->setAccessible(true); - // Check that inline image data does not corrupt the stream - // See: https://github.com/smalot/pdfparser/issues/691 $cleaned = $formatContent->invoke( $this->getPdfObjectInstance(new Document()), - 'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 /BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g' + 'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 +/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g' ); // PdfParser should not be fooled by Q's in inline image data;