diff --git a/samples/bugs/Issue673.pdf b/samples/bugs/Issue673.pdf new file mode 100644 index 00000000..a2138b51 Binary files /dev/null and b/samples/bugs/Issue673.pdf differ diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index b136afb1..5e17083a 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -864,39 +864,39 @@ private function getHeaderValue(?array $headerDic, string $key, string $type, $d */ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array { - $startxrefPreg = preg_match( - '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', + // If the $offset is currently pointed at whitespace, bump it + // forward until it isn't; affects loosely targeted offsets + // for the 'xref' keyword + // See: https://github.com/smalot/pdfparser/issues/673 + $bumpOffset = $offset; + while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) { + ++$bumpOffset; + } + + // Find all startxref tables from this $offset forward + $startxrefPreg = preg_match_all( + '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $pdfData, - $matches, - \PREG_OFFSET_CAPTURE, + $startxrefMatches, + \PREG_SET_ORDER, $offset ); - if (0 == $offset) { - // find last startxref - $pregResult = preg_match_all( - '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', - $pdfData, - $matches, - \PREG_SET_ORDER, - $offset - ); - if (0 == $pregResult) { - throw new \Exception('Unable to find startxref'); - } - $matches = array_pop($matches); - $startxref = $matches[1]; - } elseif (strpos($pdfData, 'xref', $offset) == $offset) { + if (0 == $startxrefPreg) { + // No startxref tables were found + throw new \Exception('Unable to find startxref'); + } elseif (0 == $offset) { + // Use the last startxref in the document + $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1]; + } elseif (strpos($pdfData, 'xref', 
$bumpOffset) == $bumpOffset) { // Already pointing at the xref table - $startxref = $offset; - } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) { + $startxref = $bumpOffset; + } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) { // Cross-Reference Stream object - $startxref = $offset; - } elseif ($startxrefPreg) { - // startxref found - $startxref = $matches[1][0]; + $startxref = $bumpOffset; } else { - throw new \Exception('Unable to find startxref'); + // Use the next startxref from this $offset + $startxref = (int) $startxrefMatches[0][1]; } if ($startxref > \strlen($pdfData)) { diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index dec70977..7a586932 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -172,4 +172,26 @@ public function testDecodeXrefStreamIssue479(): void $this->assertArrayHasKey('Subject', $details); $this->assertArrayHasKey('Title', $details); } + + /** + * Account for inaccurate offset values in getXrefData. + * + * Normally offset values extracted from the PDF document are exact. + * However in some cases, they may point to whitespace *before* a + * valid xref keyword. Move the offset forward past whitespace to + * make this function a little more lenient. + * + * @see https://github.com/smalot/pdfparser/issues/673 + */ + public function testGetXrefDataIssue673(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue673.pdf'; + + // Parsing this document would previously throw an Exception + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $text = $document->getText(); + + self::assertStringContainsString('6 rue des Goutais', $text); + } }