diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php index dbea6033..b6207bb8 100644 --- a/src/Smalot/PdfParser/Font.php +++ b/src/Smalot/PdfParser/Font.php @@ -349,18 +349,20 @@ public static function decodeHexadecimal(string $hexa, bool $add_braces = false) */ public static function decodeOctal(string $text): string { - $parts = preg_split('/(? '[**pdfparserdblslsh**]']); - foreach ($parts as $part) { - if (preg_match('/^\\\\[0-7]{1,3}$/', $part)) { - $text .= \chr(octdec(trim($part, '\\'))); - } else { - $text .= str_replace(['\\\\', '\\(', '\\)'], ['\\', '(', ')'], $part); - } - } + // Now we can replace all octal codes without worrying about + // escaped backslashes + $text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) { + return \chr(octdec($m[1])); + }, $text); - return $text; + // Unescape any parentheses + $text = str_replace(['\\(', '\\)'], ['(', ')'], $text); + + // Replace instances of the special string with a single backslash + return str_replace('[**pdfparserdblslsh**]', '\\', $text); } /** @@ -368,18 +370,9 @@ public static function decodeOctal(string $text): string */ public static function decodeEntities(string $text): string { - $parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE); - $text = ''; - - foreach ($parts as $part) { - if (preg_match('/^#\d{2}$/', $part)) { - $text .= \chr(hexdec(trim($part, '#'))); - } else { - $text .= $part; - } - } - - return $text; + return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) { + return \chr(hexdec($m[1])); + }, $text); } /** diff --git a/tests/PHPUnit/Integration/FontTest.php b/tests/PHPUnit/Integration/FontTest.php index 599a4203..c1aeabba 100644 --- a/tests/PHPUnit/Integration/FontTest.php +++ b/tests/PHPUnit/Integration/FontTest.php @@ -281,12 +281,29 @@ public function testDecodeOctal(): void $this->assertEquals('AB C', Font::decodeOctal('\\101\\102\\040\\103')); $this->assertEquals('AB CD', Font::decodeOctal('\\101\\102\\040\\103D')); $this->assertEquals('AB \199', Font::decodeOctal('\\101\\102\\040\\\\199')); + + // Test that series of backslashes of arbitrary length are decoded properly + $this->assertEquals('-', Font::decodeOctal('\\055')); // \055 + $this->assertEquals('\\055', Font::decodeOctal('\\\\055')); // \\055 + $this->assertEquals('\\-', Font::decodeOctal('\\\\\\055')); // \\\055 + $this->assertEquals('\\\\055', Font::decodeOctal('\\\\\\\\055')); // \\\\055 + $this->assertEquals('\\\\-', Font::decodeOctal('\\\\\\\\\\055')); // \\\\\055 + $this->assertEquals('\\\\\\055', Font::decodeOctal('\\\\\\\\\\\\055')); // \\\\\\055 + $this->assertEquals('\\\\\\-', Font::decodeOctal('\\\\\\\\\\\\\\055')); // \\\\\\\055 + + // Make sure we're unescaping ( and ) before returning the escaped + // backslashes to the string + $this->assertEquals('\\(', Font::decodeOctal('\\\\(')); // \\( - nothing to unescape + $this->assertEquals('\\(', Font::decodeOctal('\\\\\\(')); // \\\( - parenthesis unescaped + $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\(')); // \\\\( - nothing to unescape + $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\\\(')); // \\\\\( - parenthesis unescaped } public function testDecodeEntities(): void { $this->assertEquals('File Type', Font::decodeEntities('File#20Type')); $this->assertEquals('File# Ty#pe', Font::decodeEntities('File##20Ty#pe')); + $this->assertEquals('Fi#le#-Ty#p#e ', Font::decodeEntities('Fi#23le##2DTy#p#e ')); } public function testDecodeUnicode(): void