From 078246aa41514465a08a7636aff8f64ab9f1cf25 Mon Sep 17 00:00:00 2001 From: Jonas Raoni Soares da Silva Date: Fri, 22 Jul 2022 19:05:32 +0300 Subject: [PATCH] #282 Added support for extra escaping sequences --- src/Loader/StrictPoLoader.php | 58 +++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/src/Loader/StrictPoLoader.php b/src/Loader/StrictPoLoader.php index 07d2e4c..6151ca1 100644 --- a/src/Loader/StrictPoLoader.php +++ b/src/Loader/StrictPoLoader.php @@ -8,7 +8,7 @@ use Gettext\Translations; /** - * Class to load a PO file following the same rules of the GNU tools. + * Class to load a PO file following the same rules of the GNU gettext tools. */ final class StrictPoLoader extends Loader { @@ -139,7 +139,19 @@ private function readNumber(): string } /** - * Attempts to read a standard comment string which ends on \n + * Read at least one character from the given character set + */ + private function readCharset(string $charset, int $maxLength): string + { + for ($data = ''; ($char = $this->getChar()) !== null && is_int(strpos($charset, $char)) && --$maxLength >= 0; $data .= $this->nextChar()); + if ($data === '') { + throw new Exception("Expected at least one occurrence of the characters \"{$charset}\" at byte {$this->position}"); + } + return $data; + } + + /** + * Attempts to read a standard comment string which ends on \n or \r */ private function readCommentString(): string { @@ -163,7 +175,9 @@ private function readQuotedString(): string 't' => "\t", 'v' => "\x0b", '"' => '"', - ]; + ], + $octalDigits = '01234567', + $hexDigits = $octalDigits . '89abcdefABCDEF'; $hasData = false; for ($data = '';;) { if (!$this->readChar('"')) { @@ -176,15 +190,35 @@ private function readQuotedString(): string // Collects chars until the end of the data/file for (; ($char = $this->getChar() ?? '"') !== '"'; $data .= $char) { $this->nextChar(); - if ($char === '\\') { - // Ensures the next char is a valid escape character - if (($char = $aliases[$this->nextChar()] ?? null) === null) { - throw new Exception("Invalid quoted character at byte {$this->position}"); - } - continue; - } - if ($char === "\n" || $char === "\r") { - throw new Exception("New line character must be encoded at byte {$this->position}"); + $octalSequence = ctype_digit($char) ? $char : ''; + switch ($char) { + case '\\': + // Ensures the next char is a valid escape character + if (($char = $aliases[$this->nextChar()] ?? null) === null) { + throw new Exception("Invalid quoted character at byte {$this->position}"); + } + break; + case "\n": + case "\r": + throw new Exception("New line character must be encoded at byte {$this->position}"); + case 'U': + case 'u': + // The GNU gettext is supposed to follow the escaping sequences of C, but curiously it doesn't support the u/U + $data = $this->readCharset($hexDigits, $char === 'u' ? 4 : 8); + $data = str_pad($data, strlen($data) + (strlen($data) & 1), '0', STR_PAD_LEFT); + $char = json_decode("\"\\u{$data}\""); + break; + case $octalSequence: + $data = $this->readCharset($octalDigits, 3); + // GNU gettext fails with octals above the signed char range + if (($decimal = octdec($data)) > 127) { + throw new Exception("Octal value out of range [0, 0177] at byte {$this->position}"); + } + $char = chr($decimal); + case 'x': + $data = $this->readCharset($hexDigits, PHP_INT_MAX); + // GNU reads all valid hexadecimal chars, but only uses the last pair + $char = hexdec(substr($data, -2)); } } if (!$this->readChar('"')) {