From cba5b04d9cf9e1205a54553e3f1fda93544eb451 Mon Sep 17 00:00:00 2001 From: Daniel Kahn Gillmor Date: Fri, 15 Jul 2022 19:01:56 -0400 Subject: [PATCH] Avoid a crash when a ToUnicode CMap has an empty dstString in beginbfchar This is not a principled fix, but it is a hack to avoid a crash when encountering an empty dstString in a `beginbfchar` table in a ToUnicode CMap. The right way to fix this would be to replace all the string manipulation with a formal grammar, but I don't have the skill or capacity to do that right now. Instead, we take narrow aim at the issue of zero-length (empty) hex string representations. We take advantage of the fact that no angle-bracket-delimited hex string contains a "." character. When we encounter an empty hex string, rather than replacing it with the empty string, we replace it with a literal ".". Then, when we encounter a ".", we remember that it was supposed to be an empty string. One consequence of this fix is that the exported cmap can now return an empty string, so we also have to clean up `PageObject::process_operation` so that it doesn't try to read the final character from an empty string. This is a hackish workaround for #1111. --- PyPDF2/_cmap.py | 18 ++++++++++++++---- PyPDF2/_page.py | 1 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index 2c5ec2ee5..72bb6c1ad 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -191,7 +191,13 @@ def parse_to_unicode( for i in range(len(ll)): j = ll[i].find(b">") if j >= 0: - ll[i] = ll[i][:j].replace(b" ", b"") + b" " + ll[i][j + 1 :] + if j == 0: + # string is empty: stash a placeholder here (see below) + # see https://github.com/py-pdf/PyPDF2/issues/1111 + content = b"." 
+ else: + content = ll[i][:j].replace(b" ", b"") + ll[i] = content + b" " + ll[i][j + 1 :] cm = ( (b" ".join(ll)) .replace(b"[", b" [ ") @@ -246,13 +252,17 @@ def parse_to_unicode( lst = [x for x in l.split(b" ") if x] map_dict[-1] = len(lst[0]) // 2 while len(lst) > 1: + map_to = "" + # placeholder (see above) means empty string + if lst[1] != b".": + map_to = unhexlify(lst[1]).decode( + "utf-16-be", "surrogatepass" + ) # join is here as some cases where the code was split map_dict[ unhexlify(lst[0]).decode( "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass" ) - ] = unhexlify(lst[1]).decode( - "utf-16-be", "surrogatepass" - ) # join is here as some cases where the code was split + ] = map_to int_entry.append(int(lst[0], 16)) lst = lst[2:] for a, value in map_dict.items(): diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 54ca9982d..a728cd6e9 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1377,6 +1377,7 @@ def process_operation(operator: bytes, operands: List) -> None: if ( (abs(float(op)) >= _space_width) and (abs(float(op)) <= 8 * _space_width) + and (len(text) > 0) and (text[-1] != " ") ): process_operation(b"Tj", [" "])