Skip to content

Commit

Permalink
Extract StringUnescaper from ConstExprParser
Browse files Browse the repository at this point in the history
  • Loading branch information
ondrejmirtes committed Apr 17, 2023
1 parent 5e2f2e0 commit 376023a
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 92 deletions.
13 changes: 8 additions & 5 deletions src/Ast/ConstExpr/QuoteAwareConstExprStringNode.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
use function sprintf;
use function str_pad;
use function strlen;
use const STR_PAD_LEFT;

class QuoteAwareConstExprStringNode implements ConstExprNode
{
Expand Down Expand Up @@ -47,9 +48,10 @@ public function __toString(): string
return sprintf('"%s"', $this->escapeDoubleQuotedString());
}

private function escapeDoubleQuotedString() {
private function escapeDoubleQuotedString()
{
$quote = '"';
$escaped = addcslashes($this->value, "\n\r\t\f\v$" . $quote . "\\");
$escaped = addcslashes($this->value, "\n\r\t\f\v$" . $quote . '\\');

// Escape control characters and non-UTF-8 characters.
// Regex based on https://stackoverflow.com/a/11709412/385378.
Expand All @@ -68,10 +70,11 @@ private function escapeDoubleQuotedString() {
| (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2}) # Short 4 byte sequence
| (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2)
)/x';
return preg_replace_callback($regex, function ($matches) {
return preg_replace_callback($regex, static function ($matches) {
assert(strlen($matches[0]) === 1);
$hex = dechex(ord($matches[0]));;
return '\\x' . str_pad($hex, 2, '0', \STR_PAD_LEFT);
$hex = dechex(ord($matches[0]));

return '\\x' . str_pad($hex, 2, '0', STR_PAD_LEFT);
}, $escaped);
}

Expand Down
88 changes: 1 addition & 87 deletions src/Parser/ConstExprParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,12 @@

use PHPStan\PhpDocParser\Ast;
use PHPStan\PhpDocParser\Lexer\Lexer;
use function chr;
use function hexdec;
use function octdec;
use function preg_replace_callback;
use function str_replace;
use function strtolower;
use function substr;

class ConstExprParser
{

private const REPLACEMENTS = [
'\\' => '\\',
'n' => "\n",
'r' => "\r",
't' => "\t",
'f' => "\f",
'v' => "\v",
'e' => "\x1B",
];

/** @var bool */
private $unescapeStrings;

Expand Down Expand Up @@ -56,7 +41,7 @@ public function parse(TokenIterator $tokens, bool $trimStrings = false): Ast\Con
$type = $tokens->currentTokenType();
if ($trimStrings) {
if ($this->unescapeStrings) {
$value = self::unescapeString($value);
$value = StringUnescaper::unescapeString($value);
} else {
$value = substr($value, 1, -1);
}
Expand Down Expand Up @@ -171,75 +156,4 @@ private function parseArrayItem(TokenIterator $tokens): Ast\ConstExpr\ConstExprA
return new Ast\ConstExpr\ConstExprArrayItemNode($key, $value);
}

private static function unescapeString(string $string): string
{
$quote = $string[0];

if ($quote === '\'') {
return str_replace(
['\\\\', '\\\''],
['\\', '\''],
substr($string, 1, -1)
);
}

return self::parseEscapeSequences(substr($string, 1, -1), '"');
}

/**
* Implementation based on https://github.com/nikic/PHP-Parser/blob/b0edd4c41111042d43bb45c6c657b2e0db367d9e/lib/PhpParser/Node/Scalar/String_.php#L90-L130
*/
private static function parseEscapeSequences(string $str, string $quote): string
{
$str = str_replace('\\' . $quote, $quote, $str);

return preg_replace_callback(
'~\\\\([\\\\nrtfve]|[xX][0-9a-fA-F]{1,2}|[0-7]{1,3}|u\{([0-9a-fA-F]+)\})~',
static function ($matches) {
$str = $matches[1];

if (isset(self::REPLACEMENTS[$str])) {
return self::REPLACEMENTS[$str];
}
if ($str[0] === 'x' || $str[0] === 'X') {
return chr(hexdec(substr($str, 1)));
}
if ($str[0] === 'u') {
return self::codePointToUtf8(hexdec($matches[2]));
}

return chr(octdec($str));
},
$str
);
}

/**
* Implementation based on https://github.com/nikic/PHP-Parser/blob/b0edd4c41111042d43bb45c6c657b2e0db367d9e/lib/PhpParser/Node/Scalar/String_.php#L132-L154
*/
private static function codePointToUtf8(int $num): string
{
if ($num <= 0x7F) {
return chr($num);
}
if ($num <= 0x7FF) {
return chr(($num >> 6) + 0xC0)
. chr(($num & 0x3F) + 0x80);
}
if ($num <= 0xFFFF) {
return chr(($num >> 12) + 0xE0)
. chr((($num >> 6) & 0x3F) + 0x80)
. chr(($num & 0x3F) + 0x80);
}
if ($num <= 0x1FFFFF) {
return chr(($num >> 18) + 0xF0)
. chr((($num >> 12) & 0x3F) + 0x80)
. chr((($num >> 6) & 0x3F) + 0x80)
. chr(($num & 0x3F) + 0x80);
}

// Invalid UTF-8 codepoint escape sequence: Codepoint too large
return "\xef\xbf\xbd";
}

}
96 changes: 96 additions & 0 deletions src/Parser/StringUnescaper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
<?php declare(strict_types = 1);

namespace PHPStan\PhpDocParser\Parser;

use function chr;
use function hexdec;
use function octdec;
use function preg_replace_callback;
use function str_replace;
use function substr;

/**
 * Converts the raw source text of a quoted string literal (quotes included)
 * into the string value it denotes.
 *
 * Single-quoted literals only recognise the escapes \\ and \'.
 * Double-quoted literals recognise \", the single-character escapes listed in
 * REPLACEMENTS, hexadecimal (\xHH), octal (\OOO) and Unicode (\u{H...}) escapes.
 */
class StringUnescaper
{

	/** Single-character escapes (the character after the backslash => the resulting byte). */
	private const REPLACEMENTS = [
		'\\' => '\\',
		'n' => "\n",
		'r' => "\r",
		't' => "\t",
		'f' => "\f",
		'v' => "\v",
		'e' => "\x1B",
	];

	/** Largest code point codePointToUtf8() is able to encode (four-byte sequence). */
	private const MAX_ENCODABLE_CODE_POINT = 0x1FFFFF;

	/** U+FFFD REPLACEMENT CHARACTER in UTF-8, emitted for code points that are too large to encode. */
	private const INVALID_CODE_POINT_REPLACEMENT = "\xef\xbf\xbd";

	/**
	 * Strips the surrounding quotes from a string literal and resolves its escape sequences.
	 *
	 * The first character decides the flavour: a leading ' selects single-quoted rules,
	 * anything else is handled with double-quoted rules. The first and last characters
	 * are always dropped.
	 *
	 * @param string $string Literal including its opening and closing quote
	 * @return string Unescaped contents; an empty input yields an empty string
	 */
	public static function unescapeString(string $string): string
	{
		if ($string === '') {
			// There is no quote character to inspect; reading offset 0 would raise a warning.
			return '';
		}

		$quote = $string[0];

		if ($quote === '\'') {
			return str_replace(
				['\\\\', '\\\''],
				['\\', '\''],
				substr($string, 1, -1)
			);
		}

		return self::parseEscapeSequences(substr($string, 1, -1), '"');
	}

	/**
	 * Resolves escape sequences inside the contents of a double-quoted style string.
	 *
	 * Implementation based on https://github.com/nikic/PHP-Parser/blob/b0edd4c41111042d43bb45c6c657b2e0db367d9e/lib/PhpParser/Node/Scalar/String_.php#L90-L130
	 *
	 * @param string $str String contents without the surrounding quotes
	 * @param string $quote Quote character whose backslash-escaped form is replaced by the bare character
	 * @return string Contents with all recognised escape sequences replaced
	 */
	private static function parseEscapeSequences(string $str, string $quote): string
	{
		$str = str_replace('\\' . $quote, $quote, $str);

		return preg_replace_callback(
			'~\\\\([\\\\nrtfve]|[xX][0-9a-fA-F]{1,2}|[0-7]{1,3}|u\{([0-9a-fA-F]+)\})~',
			static function ($matches) {
				$str = $matches[1];

				if (isset(self::REPLACEMENTS[$str])) {
					return self::REPLACEMENTS[$str];
				}
				if ($str[0] === 'x' || $str[0] === 'X') {
					// At most two hex digits are matched, so the value always fits into one byte.
					return chr(hexdec(substr($str, 1)));
				}
				if ($str[0] === 'u') {
					// hexdec() returns a float once the literal exceeds the integer range.
					// Passing that float on to codePointToUtf8(int) would raise a TypeError,
					// so oversized code points are mapped to the replacement character here.
					$codePoint = hexdec($matches[2]);
					if ($codePoint > self::MAX_ENCODABLE_CODE_POINT) {
						return self::INVALID_CODE_POINT_REPLACEMENT;
					}

					return self::codePointToUtf8((int) $codePoint);
				}

				// Up to three octal digits may reach 0777 (511); keep only the low byte
				// explicitly instead of relying on chr() wrapping out-of-range values.
				return chr(octdec($str) & 0xFF);
			},
			$str
		);
	}

	/**
	 * Encodes a Unicode code point as a UTF-8 byte sequence.
	 *
	 * Implementation based on https://github.com/nikic/PHP-Parser/blob/b0edd4c41111042d43bb45c6c657b2e0db367d9e/lib/PhpParser/Node/Scalar/String_.php#L132-L154
	 *
	 * @param int $num Code point to encode
	 * @return string One to four bytes, or the U+FFFD replacement character when $num is above 0x1FFFFF
	 */
	private static function codePointToUtf8(int $num): string
	{
		if ($num <= 0x7F) {
			return chr($num);
		}
		if ($num <= 0x7FF) {
			return chr(($num >> 6) + 0xC0)
				. chr(($num & 0x3F) + 0x80);
		}
		if ($num <= 0xFFFF) {
			return chr(($num >> 12) + 0xE0)
				. chr((($num >> 6) & 0x3F) + 0x80)
				. chr(($num & 0x3F) + 0x80);
		}
		if ($num <= self::MAX_ENCODABLE_CODE_POINT) {
			return chr(($num >> 18) + 0xF0)
				. chr((($num >> 12) & 0x3F) + 0x80)
				. chr((($num >> 6) & 0x3F) + 0x80)
				. chr(($num & 0x3F) + 0x80);
		}

		// Invalid UTF-8 codepoint escape sequence: Codepoint too large
		return self::INVALID_CODE_POINT_REPLACEMENT;
	}

}

0 comments on commit 376023a

Please sign in to comment.