From c81cbe113c513e2e81b9eaea7761a67163320e4a Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 26 Aug 2018 01:33:56 +0200 Subject: [PATCH 1/2] Extract the "scanning for endstream command" part of `Parser.makeStream` into a helper method With this code now living in a separate method, it can be simplified slightly (e.g. by using early returns). --- src/core/parser.js | 84 ++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/src/core/parser.js b/src/core/parser.js index 68bbe7e0cdf2b..ac864819e6330 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -471,13 +471,45 @@ var Parser = (function ParserClosure() { return imageStream; }, + + _findStreamLength(startPos, signature) { + const { stream, } = this.lexer; + stream.pos = startPos; + + const SCAN_BLOCK_LENGTH = 2048; + const signatureLength = signature.length; + + while (stream.pos < stream.end) { + const scanBytes = stream.peekBytes(SCAN_BLOCK_LENGTH); + const scanLength = scanBytes.length - signatureLength; + + if (scanLength <= 0) { + break; + } + let pos = 0; + while (pos < scanLength) { + let j = 0; + while (j < signatureLength && scanBytes[pos + j] === signature[j]) { + j++; + } + if (j >= signatureLength) { // `signature` found. 
+ stream.pos += pos; + return (stream.pos - startPos); + } + pos++; + } + stream.pos += scanLength; + } + return -1; + }, + makeStream: function Parser_makeStream(dict, cipherTransform) { var lexer = this.lexer; var stream = lexer.stream; // get stream start position lexer.skipToNextLine(); - var pos = stream.pos - 1; + const startPos = stream.pos - 1; // get length var length = dict.get('Length'); @@ -487,52 +519,22 @@ var Parser = (function ParserClosure() { } // skip over the stream data - stream.pos = pos + length; + stream.pos = startPos + length; lexer.nextChar(); // Shift '>>' and check whether the new object marks the end of the stream if (this.tryShift() && isCmd(this.buf2, 'endstream')) { this.shift(); // 'stream' } else { - // bad stream length, scanning for endstream - stream.pos = pos; - var SCAN_BLOCK_SIZE = 2048; - var ENDSTREAM_SIGNATURE_LENGTH = 9; - var ENDSTREAM_SIGNATURE = [0x65, 0x6E, 0x64, 0x73, 0x74, 0x72, 0x65, - 0x61, 0x6D]; - var skipped = 0, found = false, i, j; - while (stream.pos < stream.end) { - var scanBytes = stream.peekBytes(SCAN_BLOCK_SIZE); - var scanLength = scanBytes.length - ENDSTREAM_SIGNATURE_LENGTH; - if (scanLength <= 0) { - break; - } - found = false; - i = 0; - while (i < scanLength) { - j = 0; - while (j < ENDSTREAM_SIGNATURE_LENGTH && - scanBytes[i + j] === ENDSTREAM_SIGNATURE[j]) { - j++; - } - if (j >= ENDSTREAM_SIGNATURE_LENGTH) { - found = true; - break; - } - i++; - } - if (found) { - skipped += i; - stream.pos += i; - break; - } - skipped += scanLength; - stream.pos += scanLength; - } - if (!found) { - throw new FormatError('Missing endstream'); + // Bad stream length, scanning for endstream command. 
+ const ENDSTREAM_SIGNATURE = new Uint8Array([ + 0x65, 0x6E, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6D]); + let actualLength = this._findStreamLength(startPos, + ENDSTREAM_SIGNATURE); + if (actualLength < 0) { + throw new FormatError('Missing endstream command.'); } - length = skipped; + length = actualLength; lexer.nextChar(); this.shift(); @@ -540,7 +542,7 @@ var Parser = (function ParserClosure() { } this.shift(); // 'endstream' - stream = stream.makeSubStream(pos, length, dict); + stream = stream.makeSubStream(startPos, length, dict); if (cipherTransform) { stream = cipherTransform.createStream(stream, length); } From 95e5bad4c4ab6c99f14b2da2822375de2a7550bf Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 26 Aug 2018 01:49:31 +0200 Subject: [PATCH 2/2] Attempt to find truncated endstream commands, in the fallback code-path, in `Parser.makeStream` (issue 10004) Apparently there are some PDF generators, in this case the culprit is "Nooog Pdf Library / Nooog PStoPDF v1.5", that manage to mess up PDF creation enough that endstream[1] commands actually become truncated. *Please note:* The solution implemented here isn't perfect, since it won't be able to cope with PDF files that contain a *mixture* of correct and truncated endstream commands. However, considering that this particular mode of corruption *fortunately* doesn't seem very common[2], a slightly less complex solution ought to suffice for now. Fixes 10004. --- [1] Scanning through the PDF data to find endstream commands becomes necessary, in order to determine the stream length in cases where the `Length` entry of the (stream) dictionary is missing/incorrect. [2] I cannot recall having seen any (previous) issues/bugs with "Missing endstream" errors. 
--- src/core/parser.js | 33 ++++++++++++++++++++++++++++++--- test/pdfs/issue10004.pdf.link | 1 + test/test_manifest.json | 7 +++++++ 3 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 test/pdfs/issue10004.pdf.link diff --git a/src/core/parser.js b/src/core/parser.js index ac864819e6330..10b8b0c5fd074 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -18,8 +18,8 @@ import { PredictorStream, RunLengthStream } from './stream'; import { - assert, FormatError, info, isNum, isSpace, isString, MissingDataException, - StreamType, warn + assert, bytesToString, FormatError, info, isNum, isSpace, isString, + MissingDataException, StreamType, warn } from '../shared/util'; import { Cmd, Dict, EOF, isCmd, isDict, isEOF, isName, Name, Ref @@ -532,7 +532,34 @@ var Parser = (function ParserClosure() { let actualLength = this._findStreamLength(startPos, ENDSTREAM_SIGNATURE); if (actualLength < 0) { - throw new FormatError('Missing endstream command.'); + // Only allow limited truncation of the endstream signature, + // to prevent false positives. + const MAX_TRUNCATION = 1; + // Check if the PDF generator included truncated endstream commands, + // such as e.g. "endstrea" (fixes issue10004.pdf). + for (let i = 1; i <= MAX_TRUNCATION; i++) { + const end = ENDSTREAM_SIGNATURE.length - i; + const TRUNCATED_SIGNATURE = ENDSTREAM_SIGNATURE.slice(0, end); + + let maybeLength = this._findStreamLength(startPos, + TRUNCATED_SIGNATURE); + if (maybeLength >= 0) { + // Ensure that the byte immediately following the truncated + // endstream command is a space, to prevent false positives. 
+ const lastByte = stream.peekBytes(end + 1)[end]; + if (!isSpace(lastByte)) { + break; + } + info(`Found "${bytesToString(TRUNCATED_SIGNATURE)}" when ` + + 'searching for endstream command.'); + actualLength = maybeLength; + break; + } + } + + if (actualLength < 0) { + throw new FormatError('Missing endstream command.'); + } } length = actualLength; diff --git a/test/pdfs/issue10004.pdf.link b/test/pdfs/issue10004.pdf.link new file mode 100644 index 0000000000000..4e3abde0dadcc --- /dev/null +++ b/test/pdfs/issue10004.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/2315390/2371410.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index e97557e79f3c8..2a0551496dbf8 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -726,6 +726,13 @@ "link": false, "type": "load" }, + { "id": "issue10004", + "file": "pdfs/issue10004.pdf", + "md5": "64d1853060cefe3be50e5c4617dd0505", + "rounds": 1, + "link": true, + "type": "load" + }, { "id": "issue7507", "file": "pdfs/issue7507.pdf", "md5": "f7aeaafe0c89b94436e94eaa63307303",