From 44cb8af0aa0a368c3c1186d03b85666c3638962c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Fazekas?= Date: Tue, 3 May 2022 12:43:35 +0200 Subject: [PATCH 1/2] HP Scan invalid Length workaround --- lib/combine_pdf/parser.rb | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/lib/combine_pdf/parser.rb b/lib/combine_pdf/parser.rb index 2689750..1ae3116 100644 --- a/lib/combine_pdf/parser.rb +++ b/lib/combine_pdf/parser.rb @@ -34,6 +34,7 @@ class PDFParser attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object, :metadata attr_reader :allow_optional_content, :raise_on_encrypted + attr_reader :relaxed # when creating a parser, it is important to set the data (String) we wish to parse. # # the data is required and it is not possible to set the data at a later stage @@ -59,6 +60,7 @@ def initialize(string, options = {}) @scanner = nil @allow_optional_content = options[:allow_optional_content] @raise_on_encrypted = options[:raise_on_encrypted] + @relaxed = options[:relaxed] end # parse the data in the new parser (the data already set through the initialize / new method) @@ -363,7 +365,21 @@ def _parse_ # advance by the publshed stream length (if any) old_pos = @scanner.pos if(out.last.is_a?(Hash) && out.last[:Length].is_a?(Integer) && out.last[:Length] > 2) - @scanner.pos += out.last[:Length] - 2 + begin + @scanner.pos += out.last[:Length] - 2 + rescue RangeError => error + raise error unless @relaxed + oldpos = @scanner.pos + skipped = @scanner.skip_until(/endstream/) + if skipped + len = skipped - 'endstream'.length + warn "CombinePDF parser: invalid length: #{out.last[:Length]} for object: #{out.last} should be: #{len}" + @scanner.pos = oldpos + @scanner.pos += len + else + raise ParsingError, "Parsing Error: PDF file error - a stream object with invalid length of #{out.last[:Length]} for object #{out.last} and no endstream found, to work around it" + end + end end # the following was dicarded 
because some PDF files didn't have an EOL marker as required From c450a0559eedc35f792623503061ff29bbbc2c52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Fazekas?= Date: Wed, 15 Nov 2023 10:25:13 +0100 Subject: [PATCH 2/2] Use check stream length to check and correct stream lengths if needed --- lib/combine_pdf/parser.rb | 43 +++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/lib/combine_pdf/parser.rb b/lib/combine_pdf/parser.rb index 1ae3116..99340c2 100644 --- a/lib/combine_pdf/parser.rb +++ b/lib/combine_pdf/parser.rb @@ -34,7 +34,7 @@ class PDFParser attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object, :metadata attr_reader :allow_optional_content, :raise_on_encrypted - attr_reader :relaxed + attr_reader :check_stream_length # when creating a parser, it is important to set the data (String) we wish to parse. # # the data is required and it is not possible to set the data at a later stage @@ -60,7 +60,7 @@ def initialize(string, options = {}) @scanner = nil @allow_optional_content = options[:allow_optional_content] @raise_on_encrypted = options[:raise_on_encrypted] - @relaxed = options[:relaxed] + @check_stream_length = options[:check_stream_length] end # parse the data in the new parser (the data already set through the initialize / new method) @@ -365,20 +365,10 @@ def _parse_ # advance by the publshed stream length (if any) old_pos = @scanner.pos if(out.last.is_a?(Hash) && out.last[:Length].is_a?(Integer) && out.last[:Length] > 2) - begin + if @check_stream_length + advance_pos_with_length_check(@scanner, out.last[:Length], out.last) + else @scanner.pos += out.last[:Length] - 2 - rescue RangeError => error - raise error unless @relaxed - oldpos = @scanner.pos - skipped = @scanner.skip_until(/endstream/) - if skipped - len = skipped - 'endstream'.length - warn "CombinePDF parser: invalid length: #{out.last[:Length]} for object: #{out.last} should be: #{len}" - 
@scanner.pos = oldpos - @scanner.pos += len - else - raise ParsingError, "Parsing Error: PDF file error - a stream object with invalid length of #{out.last[:Length]} for object #{out.last} and no endstream found, to work around it" - end end end @@ -523,6 +513,29 @@ def _parse_ protected + def advance_pos_with_length_check(scanner, length, obj) + endstream = 'endstream' + orig_pos = scanner.pos + if scanner.rest_size > length + scanner.pos += length + if scanner.check(/\s*endstream/) + scanner.pos -= 2 + return + end + warn "Invalid length no #{endstream} found - object: #{obj}!" + else + warn "Invalid length in stream points out of the file - object: #{obj}!" + end + scanner.pos = orig_pos + skipped = scanner.skip_until(/endstream/) + if skipped + # leave the scanner 2 bytes before the located keyword, mirroring the trusted-length path + scanner.pos -= endstream.length + 2 + else + raise ParsingError, "Parsing Error: PDF file error - a stream object with invalid length of #{length} for object #{obj} and no #{endstream} found after, to work around it" + end + end + # resets cataloging and pages def catalog_pages(catalogs = nil, inheritance_hash = {}) unless catalogs