From 5c4826190cceb58ab8b066758944f093b94e5b3c Mon Sep 17 00:00:00 2001
From: Brian Huisman
Date: Tue, 26 Sep 2023 06:47:15 -0400
Subject: [PATCH] Better octal and hex-entity decode (#640)

* Better octal and hex-entity decode

Octal strings can include series of backslashes of arbitrary length. If
there is an odd number of backslashes, a following octal code is valid,
but if there's an even number, the following octal code should not be
translated.

Previously PdfParser would only account for two backslashes directly
preceding an octal code. A commit from in-progress PR #634 extended this
to three which probably covers 99.99% of all cases. This change ups that
to 100% in that there could be a string with any number of backslashes
in a row, and codes will be correctly translated.

Also update decodeEntities() to use a preg_replace_callback() instead of
the bulkier preg_split() + foreach loop. Make sure it matches all
hexadecimal digits including a-f.

Add new tests for both of these.

* Use #2D to ensure we're capturing hex letters

* Change order of special string replacement

Move the special string replacement after the unescaping of parentheses
so we don't unescape any parentheses we shouldn't.

Add more tests to make sure this is working.

* Apply suggestions from code review

Co-authored-by: Konrad Abicht

---------

Co-authored-by: Konrad Abicht
---
 src/Smalot/PdfParser/Font.php          | 37 +++++++++++---------------
 tests/PHPUnit/Integration/FontTest.php | 17 ++++++++++++
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php
index dbea6033..b6207bb8 100644
--- a/src/Smalot/PdfParser/Font.php
+++ b/src/Smalot/PdfParser/Font.php
@@ -349,18 +349,20 @@ public static function decodeHexadecimal(string $hexa, bool $add_braces = false)
      */
     public static function decodeOctal(string $text): string
     {
-        $parts = preg_split('/(?<!\\\\)(\\\\[0-7]{1,3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
+        // Replace all double backslashes \\ with a special string
+        $text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']);
 
-        foreach ($parts as $part) {
-            if (preg_match('/^\\\\[0-7]{1,3}$/', $part)) {
-                $text .= \chr(octdec(trim($part, '\\')));
-            } else {
-                $text .= str_replace(['\\\\', '\\(', '\\)'], ['\\', '(', ')'], $part);
-            }
-        }
+        // Now we can replace all octal codes without worrying about
+        // escaped backslashes
+        $text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) {
+            return \chr(octdec($m[1]));
+        }, $text);
 
-        return $text;
+        // Unescape any parentheses
+        $text = str_replace(['\\(', '\\)'], ['(', ')'], $text);
+
+        // Replace instances of the special string with a single backslash
+        return str_replace('[**pdfparserdblslsh**]', '\\', $text);
     }
 
     /**
@@ -368,18 +370,9 @@ public static function decodeOctal(string $text): string
      */
     public static function decodeEntities(string $text): string
     {
-        $parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
-
-        foreach ($parts as $part) {
-            if (preg_match('/^#\d{2}$/', $part)) {
-                $text .= \chr(hexdec(trim($part, '#')));
-            } else {
-                $text .= $part;
-            }
-        }
-
-        return $text;
+        return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) {
+            return \chr(hexdec($m[1]));
+        }, $text);
     }
 
     /**
diff --git a/tests/PHPUnit/Integration/FontTest.php b/tests/PHPUnit/Integration/FontTest.php
index 599a4203..c1aeabba 100644
--- a/tests/PHPUnit/Integration/FontTest.php
+++ b/tests/PHPUnit/Integration/FontTest.php
@@ -281,12 +281,29 @@ public function testDecodeOctal(): void
         $this->assertEquals('AB C', Font::decodeOctal('\\101\\102\\040\\103'));
         $this->assertEquals('AB CD', Font::decodeOctal('\\101\\102\\040\\103D'));
         $this->assertEquals('AB \199', Font::decodeOctal('\\101\\102\\040\\\\199'));
+
+        // Test that series of backslashes of arbitrary length are decoded properly
+        $this->assertEquals('-', Font::decodeOctal('\\055')); // \055
+        $this->assertEquals('\\055', Font::decodeOctal('\\\\055')); // \\055
+        $this->assertEquals('\\-', Font::decodeOctal('\\\\\\055')); // \\\055
+        $this->assertEquals('\\\\055', Font::decodeOctal('\\\\\\\\055')); // \\\\055
+        $this->assertEquals('\\\\-', Font::decodeOctal('\\\\\\\\\\055')); // \\\\\055
+        $this->assertEquals('\\\\\\055', Font::decodeOctal('\\\\\\\\\\\\055')); // \\\\\\055
+        $this->assertEquals('\\\\\\-', Font::decodeOctal('\\\\\\\\\\\\\\055')); // \\\\\\\055
+
+        // Make sure we're unescaping ( and ) before returning the escaped
+        // backslashes to the string
+        $this->assertEquals('\\(', Font::decodeOctal('\\\\(')); // \\( - nothing to unescape
+        $this->assertEquals('\\(', Font::decodeOctal('\\\\\\(')); // \\\( - parenthesis unescaped
+        $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\(')); // \\\\( - nothing to unescape
+        $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\\\(')); // \\\\\( - parenthesis unescaped
     }
 
     public function testDecodeEntities(): void
     {
         $this->assertEquals('File Type', Font::decodeEntities('File#20Type'));
         $this->assertEquals('File# Ty#pe', Font::decodeEntities('File##20Ty#pe'));
+        $this->assertEquals('Fi#le#-Ty#p#e ', Font::decodeEntities('Fi#23le##2DTy#p#e '));
     }
 
     public function testDecodeUnicode(): void