smalot · k00ni · May 13, 2024 · Mar 15, 2024 · Mar 15, 2024 · Mar 25, 2024
diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php
@@ -254,6 +254,20 @@ private function formatContent(?string $content): string
             }
         }
 
+        // Swap every inline image (BI <dict> ID <data> EI) for a placeholder
+        // so the next steps cannot alter it. The `s` modifier lets `.` cross
+        // line breaks, so images spanning several lines are protected too.
+        $pdfInlineImages = [];
+        $inlineImage = '/\sBI(.+?)\sID\s(.+?)\sEI(?=\s|$)/s';
+        while (preg_match($inlineImage, $content, $text, \PREG_OFFSET_CAPTURE)) {
+            $id = uniqid('IMAGE_', true);
+            $pdfInlineImages[$id] = [$text[1][0], $text[2][0]];
+
+            // Splice by byte offset rather than compiling the raw image bytes
+            // into a second pattern, which can fail and wipe out $content.
+            $content = substr_replace($content, '^^^'.$id.'^^^', $text[0][1], strlen($text[0][0]));
+        }
+
         // Remove all carriage returns and line-feeds from the document stream
         $content = str_replace(["\r", "\n"], ' ', trim($content));
 
@@ -303,6 +317,16 @@ private function formatContent(?string $content): string
             $content = str_replace('###'.$id.'###', $dict, $content);
         }
 
+        // Put every inline image back in place of its placeholder, walking
+        // the list newest-first and re-emitting the BI / ID / EI operators
+        // on their own CRLF-terminated lines
+        $pdfInlineImages = array_reverse($pdfInlineImages, true);
+        foreach ($pdfInlineImages as $id => [$imageDict, $imageData]) {
+            $marker = '^^^'.$id.'^^^';
+            $restored = "\r\nBI\r\n".$imageDict."\r\nID\r\n".$imageData."\r\nEI\r\n";
+            $content = str_replace($marker, $restored, $content);
+        }
+
         // Restore the original string content
         $pdfstrings = array_reverse($pdfstrings, true);
         foreach ($pdfstrings as $id => $text) {

diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php
@@ -284,6 +284,19 @@ public function testFormatContent(): void
 
         // Binary check is done before a regexp that causes an error
         $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText());
+
+        // Inline image data must come through formatContent() without
+        // corrupting the stream. See: https://github.com/smalot/pdfparser/issues/691
+        $cleaned = $formatContent->invoke(
+            $this->getPdfObjectInstance(new Document()),
+            'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 /BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
+        );
+
+        // The encoded image data itself contains 'Q' characters; only the
+        // real 'Q' operator after EI may be followed by a line break
+        $commandQ = preg_match_all('/Q\r\n/', $cleaned);
+        // assertSame pins both the value and the int type of the match count
+        $this->assertSame(1, $commandQ);
     }
 
     public function testGetSectionsText(): void