From 2964c7d51cbdaa616841c23d03f4a2f9966554b5 Mon Sep 17 00:00:00 2001 From: Danil Alexeev Date: Mon, 28 Aug 2023 13:00:33 +0300 Subject: [PATCH] GDScript: Add raw string literals (r-strings) --- .../gdscript/editor/gdscript_highlighter.cpp | 43 ++- .../gdscript/editor/gdscript_highlighter.h | 1 + modules/gdscript/gdscript_editor.cpp | 1 + modules/gdscript/gdscript_tokenizer.cpp | 278 ++++++++++-------- .../scripts/parser/errors/bad_r_string_1.gd | 2 + .../scripts/parser/errors/bad_r_string_1.out | 2 + .../scripts/parser/errors/bad_r_string_2.gd | 2 + .../scripts/parser/errors/bad_r_string_2.out | 2 + .../scripts/parser/errors/bad_r_string_3.gd | 3 + .../scripts/parser/errors/bad_r_string_3.out | 2 + .../scripts/parser/features/r_strings.gd | 22 ++ .../scripts/parser/features/r_strings.out | 22 ++ modules/regex/doc_classes/RegEx.xml | 2 +- 13 files changed, 250 insertions(+), 132 deletions(-) create mode 100644 modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.gd create mode 100644 modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.out create mode 100644 modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.gd create mode 100644 modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.out create mode 100644 modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.gd create mode 100644 modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.out create mode 100644 modules/gdscript/tests/scripts/parser/features/r_strings.gd create mode 100644 modules/gdscript/tests/scripts/parser/features/r_strings.out diff --git a/modules/gdscript/editor/gdscript_highlighter.cpp b/modules/gdscript/editor/gdscript_highlighter.cpp index e488d6e26681..1be690d894c7 100644 --- a/modules/gdscript/editor/gdscript_highlighter.cpp +++ b/modules/gdscript/editor/gdscript_highlighter.cpp @@ -52,6 +52,7 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l bool in_keyword = false; bool in_word = false; bool in_number = false; + bool in_raw_string = false; bool in_node_path = false; bool in_node_ref = false; bool in_annotation = false; @@ -234,15 +235,33 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l } if (str[from] == '\\') { - Dictionary escape_char_highlighter_info; - escape_char_highlighter_info["color"] = symbol_color; - color_map[from] = escape_char_highlighter_info; + if (!in_raw_string) { + Dictionary escape_char_highlighter_info; + escape_char_highlighter_info["color"] = symbol_color; + color_map[from] = escape_char_highlighter_info; + } from++; - Dictionary region_continue_highlighter_info; - region_continue_highlighter_info["color"] = region_color; - color_map[from + 1] = region_continue_highlighter_info; + if (!in_raw_string) { + int esc_len = 0; + if (str[from] == 'u') { + esc_len = 4; + } else if (str[from] == 'U') { + esc_len = 6; + } + for (int k = 0; k < esc_len && from < line_length - 1; k++) { + if (!is_hex_digit(str[from + 1])) { + break; + } + from++; + } + + Dictionary region_continue_highlighter_info; + region_continue_highlighter_info["color"] = region_color; + color_map[from + 1] = region_continue_highlighter_info; + } + continue; } @@ -489,6 +508,12 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l in_member_variable = false; } + if (!in_raw_string && in_region == -1 && str[j] == 'r' && j < line_length - 1 && (str[j + 1] == '"' || str[j + 1] == '\'')) { + in_raw_string = true; + } else if (in_raw_string && in_region == -1) { + in_raw_string = false; + } + // Keep symbol color for binary '&&'. In the case of '&&&' use StringName color for the last ampersand. if (!in_string_name && in_region == -1 && str[j] == '&' && !is_binary_op) { if (j >= 2 && str[j - 1] == '&' && str[j - 2] != '&' && prev_is_binary_op) { @@ -520,7 +545,9 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l in_annotation = false; } - if (in_node_ref) { + if (in_raw_string) { + color = string_color; + } else if (in_node_ref) { next_type = NODE_REF; color = node_ref_color; } else if (in_annotation) { @@ -692,7 +719,7 @@ void GDScriptSyntaxHighlighter::_update_cache() { } /* Strings */ - const Color string_color = EDITOR_GET("text_editor/theme/highlighting/string_color"); + string_color = EDITOR_GET("text_editor/theme/highlighting/string_color"); List strings; gdscript->get_string_delimiters(&strings); for (const String &string : strings) { diff --git a/modules/gdscript/editor/gdscript_highlighter.h b/modules/gdscript/editor/gdscript_highlighter.h index fe3b63d713f5..090857f397d8 100644 --- a/modules/gdscript/editor/gdscript_highlighter.h +++ b/modules/gdscript/editor/gdscript_highlighter.h @@ -78,6 +78,7 @@ class GDScriptSyntaxHighlighter : public EditorSyntaxHighlighter { Color built_in_type_color; Color number_color; Color member_color; + Color string_color; Color node_path_color; Color node_ref_color; Color annotation_color; diff --git a/modules/gdscript/gdscript_editor.cpp b/modules/gdscript/gdscript_editor.cpp index 6cad3b2b90ba..849a4b5cd0f6 100644 --- a/modules/gdscript/gdscript_editor.cpp +++ b/modules/gdscript/gdscript_editor.cpp @@ -59,6 +59,7 @@ void GDScriptLanguage::get_string_delimiters(List *p_delimiters) const { p_delimiters->push_back("' '"); p_delimiters->push_back("\"\"\" \"\"\""); p_delimiters->push_back("''' '''"); + // NOTE: StringName, NodePath and r-strings are not listed here. } bool GDScriptLanguage::is_using_templates() { diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp index 42b983ef458c..d5c02eeae15e 100644 --- a/modules/gdscript/gdscript_tokenizer.cpp +++ b/modules/gdscript/gdscript_tokenizer.cpp @@ -857,10 +857,14 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { STRING_NODEPATH, }; + bool is_raw = false; bool is_multiline = false; StringType type = STRING_REGULAR; - if (_peek(-1) == '&') { + if (_peek(-1) == 'r') { + is_raw = true; + _advance(); + } else if (_peek(-1) == '&') { type = STRING_NAME; _advance(); } else if (_peek(-1) == '^') { @@ -890,7 +894,12 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { char32_t ch = _peek(); if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) { - Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion."); + Token error; + if (is_raw) { + error = make_error("Invisible text direction control character present in the string, use regular string literal instead of r-string."); + } else { + error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion."); + } error.start_column = column; error.leftmost_column = error.start_column; error.end_column = column + 1; @@ -905,144 +914,164 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { return make_error("Unterminated string."); } - // Grab escape character. - char32_t code = _peek(); - _advance(); - if (_is_at_end()) { - return make_error("Unterminated string."); - } + if (is_raw) { + if (_peek() == quote_char) { + _advance(); + if (_is_at_end()) { + return make_error("Unterminated string."); + } + result += '\\'; + result += quote_char; + } else if (_peek() == '\\') { // For `\\\"`. + _advance(); + if (_is_at_end()) { + return make_error("Unterminated string."); + } + result += '\\'; + result += '\\'; + } else { + result += '\\'; + } + } else { + // Grab escape character. + char32_t code = _peek(); + _advance(); + if (_is_at_end()) { + return make_error("Unterminated string."); + } - char32_t escaped = 0; - bool valid_escape = true; + char32_t escaped = 0; + bool valid_escape = true; - switch (code) { - case 'a': - escaped = '\a'; - break; - case 'b': - escaped = '\b'; - break; - case 'f': - escaped = '\f'; - break; - case 'n': - escaped = '\n'; - break; - case 'r': - escaped = '\r'; - break; - case 't': - escaped = '\t'; - break; - case 'v': - escaped = '\v'; - break; - case '\'': - escaped = '\''; - break; - case '\"': - escaped = '\"'; - break; - case '\\': - escaped = '\\'; - break; - case 'U': - case 'u': { - // Hexadecimal sequence. - int hex_len = (code == 'U') ? 6 : 4; - for (int j = 0; j < hex_len; j++) { - if (_is_at_end()) { - return make_error("Unterminated string."); + switch (code) { + case 'a': + escaped = '\a'; + break; + case 'b': + escaped = '\b'; + break; + case 'f': + escaped = '\f'; + break; + case 'n': + escaped = '\n'; + break; + case 'r': + escaped = '\r'; + break; + case 't': + escaped = '\t'; + break; + case 'v': + escaped = '\v'; + break; + case '\'': + escaped = '\''; + break; + case '\"': + escaped = '\"'; + break; + case '\\': + escaped = '\\'; + break; + case 'U': + case 'u': { + // Hexadecimal sequence. + int hex_len = (code == 'U') ? 6 : 4; + for (int j = 0; j < hex_len; j++) { + if (_is_at_end()) { + return make_error("Unterminated string."); + } + + char32_t digit = _peek(); + char32_t value = 0; + if (is_digit(digit)) { + value = digit - '0'; + } else if (digit >= 'a' && digit <= 'f') { + value = digit - 'a'; + value += 10; + } else if (digit >= 'A' && digit <= 'F') { + value = digit - 'A'; + value += 10; + } else { + // Make error, but keep parsing the string. + Token error = make_error("Invalid hexadecimal digit in unicode escape sequence."); + error.start_column = column; + error.leftmost_column = error.start_column; + error.end_column = column + 1; + error.rightmost_column = error.end_column; + push_error(error); + valid_escape = false; + break; + } + + escaped <<= 4; + escaped |= value; + + _advance(); } - - char32_t digit = _peek(); - char32_t value = 0; - if (is_digit(digit)) { - value = digit - '0'; - } else if (digit >= 'a' && digit <= 'f') { - value = digit - 'a'; - value += 10; - } else if (digit >= 'A' && digit <= 'F') { - value = digit - 'A'; - value += 10; - } else { - // Make error, but keep parsing the string. - Token error = make_error("Invalid hexadecimal digit in unicode escape sequence."); - error.start_column = column; - error.leftmost_column = error.start_column; - error.end_column = column + 1; - error.rightmost_column = error.end_column; - push_error(error); - valid_escape = false; + } break; + case '\r': + if (_peek() != '\n') { + // Carriage return without newline in string. (???) + // Just add it to the string and keep going. + result += ch; + _advance(); break; } - - escaped <<= 4; - escaped |= value; - - _advance(); - } - } break; - case '\r': - if (_peek() != '\n') { - // Carriage return without newline in string. (???) - // Just add it to the string and keep going. - result += ch; - _advance(); + [[fallthrough]]; + case '\n': + // Escaping newline. + newline(false); + valid_escape = false; // Don't add to the string. break; - } - [[fallthrough]]; - case '\n': - // Escaping newline. - newline(false); - valid_escape = false; // Don't add to the string. - break; - default: - Token error = make_error("Invalid escape in string."); - error.start_column = column - 2; - error.leftmost_column = error.start_column; - push_error(error); - valid_escape = false; - break; - } - // Parse UTF-16 pair. - if (valid_escape) { - if ((escaped & 0xfffffc00) == 0xd800) { - if (prev == 0) { - prev = escaped; - prev_pos = column - 2; - continue; - } else { - Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate"); + default: + Token error = make_error("Invalid escape in string."); error.start_column = column - 2; error.leftmost_column = error.start_column; push_error(error); valid_escape = false; - prev = 0; + break; + } + // Parse UTF-16 pair. + if (valid_escape) { + if ((escaped & 0xfffffc00) == 0xd800) { + if (prev == 0) { + prev = escaped; + prev_pos = column - 2; + continue; + } else { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate."); + error.start_column = column - 2; + error.leftmost_column = error.start_column; + push_error(error); + valid_escape = false; + prev = 0; + } + } else if ((escaped & 0xfffffc00) == 0xdc00) { + if (prev == 0) { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate."); + error.start_column = column - 2; + error.leftmost_column = error.start_column; + push_error(error); + valid_escape = false; + } else { + escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000); + prev = 0; + } } - } else if ((escaped & 0xfffffc00) == 0xdc00) { - if (prev == 0) { - Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate"); - error.start_column = column - 2; + if (prev != 0) { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate."); + error.start_column = prev_pos; error.leftmost_column = error.start_column; push_error(error); - valid_escape = false; - } else { - escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000); prev = 0; } } - if (prev != 0) { - Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate"); - error.start_column = prev_pos; - error.leftmost_column = error.start_column; - push_error(error); - prev = 0; - } - } - if (valid_escape) { - result += escaped; + if (valid_escape) { + result += escaped; + } } } else if (ch == quote_char) { if (prev != 0) { @@ -1416,6 +1445,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { if (is_digit(c)) { return number(); + } else if (c == 'r' && (_peek() == '"' || _peek() == '\'')) { + // Raw string literals. + return string(); } else if (is_unicode_identifier_start(c)) { return potential_identifier(); } diff --git a/modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.gd b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.gd new file mode 100644 index 000000000000..e5eecbb81957 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.gd @@ -0,0 +1,2 @@ +func test(): + print(r"\") diff --git a/modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.out b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.out new file mode 100644 index 000000000000..c8e843b0d7a7 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_1.out @@ -0,0 +1,2 @@ +GDTEST_PARSER_ERROR +Unterminated string. diff --git a/modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.gd b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.gd new file mode 100644 index 000000000000..9168b69f86b1 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.gd @@ -0,0 +1,2 @@ +func test(): + print(r"\\"") diff --git a/modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.out b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.out new file mode 100644 index 000000000000..c8e843b0d7a7 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_2.out @@ -0,0 +1,2 @@ +GDTEST_PARSER_ERROR +Unterminated string. diff --git a/modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.gd b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.gd new file mode 100644 index 000000000000..37dc910e5f06 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.gd @@ -0,0 +1,3 @@ +func test(): + # v + print(r"['"]*") diff --git a/modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.out b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.out new file mode 100644 index 000000000000..dcb5c2f2899e --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/bad_r_string_3.out @@ -0,0 +1,2 @@ +GDTEST_PARSER_ERROR +Closing "]" doesn't have an opening counterpart. diff --git a/modules/gdscript/tests/scripts/parser/features/r_strings.gd b/modules/gdscript/tests/scripts/parser/features/r_strings.gd new file mode 100644 index 000000000000..6f546f28be0c --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/features/r_strings.gd @@ -0,0 +1,22 @@ +func test(): + print(r"test ' \' \" \\ \n \t \u2023 test") + print(r"\n\\[\t ]*(\w+)") + print(r"") + print(r"\"") + print(r"\\\"") + print(r"\\") + print(r"\" \\\" \\\\\"") + print(r"\ \\ \\\ \\\\ \\\\\ \\") + print(r'"') + print(r'"(?:\\.|[^"])*"') + print(r"""""") + print(r"""test \t "test"="" " \" \\\" \ \\ \\\ test""") + print(r'''r"""test \t "test"="" " \" \\\" \ \\ \\\ test"""''') + print(r"\t + \t") + print(r"\t \ + \t") + print(r"""\t + \t""") + print(r"""\t \ + \t""") diff --git a/modules/gdscript/tests/scripts/parser/features/r_strings.out b/modules/gdscript/tests/scripts/parser/features/r_strings.out new file mode 100644 index 000000000000..114ef0a6c3e8 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/features/r_strings.out @@ -0,0 +1,22 @@ +GDTEST_OK +test ' \' \" \\ \n \t \u2023 test +\n\\[\t ]*(\w+) + +\" +\\\" +\\ +\" \\\" \\\\\" +\ \\ \\\ \\\\ \\\\\ \\ +" +"(?:\\.|[^"])*" + +test \t "test"="" " \" \\\" \ \\ \\\ test +r"""test \t "test"="" " \" \\\" \ \\ \\\ test""" +\t + \t +\t \ + \t +\t + \t +\t \ + \t diff --git a/modules/regex/doc_classes/RegEx.xml b/modules/regex/doc_classes/RegEx.xml index 5770e7155e88..ab74fce3a935 100644 --- a/modules/regex/doc_classes/RegEx.xml +++ b/modules/regex/doc_classes/RegEx.xml @@ -10,7 +10,7 @@ var regex = RegEx.new() regex.compile("\\w-(\\d+)") [/codeblock] - The search pattern must be escaped first for GDScript before it is escaped for the expression. For example, [code]compile("\\d+")[/code] would be read by RegEx as [code]\d+[/code]. Similarly, [code]compile("\"(?:\\\\.|[^\"])*\"")[/code] would be read as [code]"(?:\\.|[^"])*"[/code]. + The search pattern must be escaped first for GDScript before it is escaped for the expression. For example, [code]compile("\\d+")[/code] would be read by RegEx as [code]\d+[/code]. Similarly, [code]compile("\"(?:\\\\.|[^\"])*\"")[/code] would be read as [code]"(?:\\.|[^"])*"[/code]. In GDScript, you can also use raw string literals (r-strings). For example, [code]compile(r'"(?:\\.|[^"])*"')[/code] would be read the same. Using [method search], you can find the pattern within the given text. If a pattern is found, [RegExMatch] is returned and you can retrieve details of the results using methods such as [method RegExMatch.get_string] and [method RegExMatch.get_start]. [codeblock] var regex = RegEx.new()