Account for inaccurate offsets in getXrefData() (#692)

* Account for inaccurate offsets in getXrefData(): Normally, offset pointers to `xref` keywords in a PDF document are exact to the byte. However, in some cases the pointer may point to some whitespace *before* the `xref` keyword. Adobe Acrobat takes these 'errors' in stride, displaying the document anyway, and so should PdfParser. Clean up the getXrefData() function in **RawDataParser.php**. It now only needs to do one `preg_match_all()` and pushes the caret past whitespace when looking for `xref` keywords. Use existing **Issue557.pdf** to create a new file: **Issue673.pdf** where the last `/Prev 13486` command has been decremented to `/Prev 13485`. Trying to parse this file would cause an Exception without this fix. * Drop unnecessary PREG_OFFSET_CAPTURE: No need to use `PREG_OFFSET_CAPTURE` here. --------- Co-authored-by: Konrad Abicht <[email protected]>
smalot · Apr 2, 2024 · fb77eab · fb77eab
1 parent ed3fc0b
commit fb77eab
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 26 deletions.
diff --git a/samples/bugs/Issue673.pdf b/samples/bugs/Issue673.pdf
diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php
@@ -864,39 +864,39 @@ private function getHeaderValue(?array $headerDic, string $key, string $type, $d
      */
     protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
     {
-        $startxrefPreg = preg_match(
-            '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
+        // If the $offset is currently pointed at whitespace, bump it
+        // forward until it isn't; affects loosely targeted offsets
+        // for the 'xref' keyword
+        // See: https://github.com/smalot/pdfparser/issues/673
+        $bumpOffset = $offset;
+        while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
+            ++$bumpOffset;
+        }
+
+        // Find all startxref tables from this $offset forward
+        $startxrefPreg = preg_match_all(
+            '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
             $pdfData,
-            $matches,
-            \PREG_OFFSET_CAPTURE,
+            $startxrefMatches,
+            \PREG_SET_ORDER,
             $offset
         );
 
-        if (0 == $offset) {
-            // find last startxref
-            $pregResult = preg_match_all(
-                '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
-                $pdfData,
-                $matches,
-                \PREG_SET_ORDER,
-                $offset
-            );
-            if (0 == $pregResult) {
-                throw new \Exception('Unable to find startxref');
-            }
-            $matches = array_pop($matches);
-            $startxref = $matches[1];
-        } elseif (strpos($pdfData, 'xref', $offset) == $offset) {
+        if (0 == $startxrefPreg) {
+            // No startxref tables were found
+            throw new \Exception('Unable to find startxref');
+        } elseif (0 == $offset) {
+            // Use the last startxref in the document
+            $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
+        } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) {
             // Already pointing at the xref table
-            $startxref = $offset;
-        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
+            $startxref = $bumpOffset;
+        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
             // Cross-Reference Stream object
-            $startxref = $offset;
-        } elseif ($startxrefPreg) {
-            // startxref found
-            $startxref = $matches[1][0];
+            $startxref = $bumpOffset;
         } else {
-            throw new \Exception('Unable to find startxref');
+            // Use the next startxref from this $offset
+            $startxref = (int) $startxrefMatches[0][1];
         }
 
         if ($startxref > \strlen($pdfData)) {

diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php
@@ -172,4 +172,26 @@ public function testDecodeXrefStreamIssue479(): void
         $this->assertArrayHasKey('Subject', $details);
         $this->assertArrayHasKey('Title', $details);
     }
+
+    /**
+     * Account for inaccurate offset values in getXrefData.
+     *
+     * Normally offset values extracted from the PDF document are exact.
+     * However in some cases, they may point to whitespace *before* a
+     * valid xref keyword. Move the offset forward past whitespace to
+     * make this function a little more lenient.
+     *
+     * @see https://github.com/smalot/pdfparser/issues/673
+     */
+    public function testGetXrefDataIssue673(): void
+    {
+        $filename = $this->rootDir.'/samples/bugs/Issue673.pdf';
+
+        // Parsing this document would previously throw an Exception
+        $parser = $this->getParserInstance();
+        $document = $parser->parseFile($filename);
+        $text = $document->getText();
+
+        self::assertStringContainsString('6 rue des Goutais', $text);
+    }
 }