From 53538eb33efc11fb1f81cef9729aed284d0a1dff Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Thu, 24 Aug 2023 10:14:51 -0400 Subject: [PATCH] Check for wrong line-endings when getting xref (#635) If we didn't find the `xref` command at the offset specified, then replace Windows `\r\n` line endings with Unix style `\n` and try again. If it succeeds, then edit the line-endings and proceed as normal. Otherwise continue on to the `decodeXrefStream()` method. Fixes parsing of existing test suite file **/samples/bugs/Issue95_ANSI.pdf** the test for which would normally be passed over because of the `@group linux-only` flag. Remove this flag, as all assertions in the `testDecodeText()` function now resolve as true in any environment. --- src/Smalot/PdfParser/RawData/RawDataParser.php | 17 +++++++++++++++-- tests/PHPUnit/Integration/FontTest.php | 3 --- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 1a4583c0..ec8f600b 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -901,8 +901,15 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ // Cross-Reference $xref = $this->decodeXref($pdfData, $startxref, $xref); } else { - // Cross-Reference Stream - $xref = $this->decodeXrefStream($pdfData, $startxref, $xref); + // Check if the $pdfData might have the wrong line-endings + $pdfDataUnix = str_replace("\r\n", "\n", $pdfData); + if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) { + // Return Unix-line-ending flag + $xref = ['Unix' => true]; + } else { + // Cross-Reference Stream + $xref = $this->decodeXrefStream($pdfData, $startxref, $xref); + } } if (empty($xref)) { throw new \Exception('Unable to find xref'); @@ -937,6 +944,12 @@ public function parseData(string $data): array // get xref and trailer data $xref = $this->getXrefData($pdfData); + // If we found Unix line-endings + if (isset($xref['Unix'])) { + $pdfData = str_replace("\r\n", "\n", $pdfData); + $xref = $this->getXrefData($pdfData); + } + // parse all document objects $objects = []; foreach ($xref['xref'] as $obj => $offset) { diff --git a/tests/PHPUnit/Integration/FontTest.php b/tests/PHPUnit/Integration/FontTest.php index b07bbf76..599a4203 100644 --- a/tests/PHPUnit/Integration/FontTest.php +++ b/tests/PHPUnit/Integration/FontTest.php @@ -294,9 +294,6 @@ public function testDecodeUnicode(): void $this->assertEquals('AB', Font::decodeUnicode("\xFE\xFF\x00A\x00B")); } - /** - * @group linux-only - */ public function testDecodeText(): void { $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';