From bd64a60e0a7b068c5a1544c503a147fd8acd15e5 Mon Sep 17 00:00:00 2001 From: springcomp Date: Tue, 18 Oct 2022 17:01:54 +0200 Subject: [PATCH] [unicode] `reverse()` learned not to break composite character sequences. --- .../Functions/Impl/TextExtensions.cs | 17 +++++++++++++++++ src/jmespath.net/Functions/ReverseFunction.cs | 5 +++-- src/jmespath.net/Utils/Text.cs | 19 +++++++++++++++++++ .../Utils/StringFunctionsTest.cs | 2 ++ tests/jmespathnet.tests/Utils/TextTest.cs | 2 ++ 5 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 src/jmespath.net/Functions/Impl/TextExtensions.cs diff --git a/src/jmespath.net/Functions/Impl/TextExtensions.cs b/src/jmespath.net/Functions/Impl/TextExtensions.cs new file mode 100644 index 0000000..8eab74a --- /dev/null +++ b/src/jmespath.net/Functions/Impl/TextExtensions.cs @@ -0,0 +1,17 @@ +using DevLab.JmesPath.Utils; +using System; +using System.Linq; + +namespace jmespath.net.Functions.Impl +{ + internal static class TextExtensions + { + /// + /// Supports the `reverse()` function. 
+ /// + /// + /// + public static Text Invert(this Text text) + => (Text)String.Join( "", text.ToArray().Reverse() ); + } +} diff --git a/src/jmespath.net/Functions/ReverseFunction.cs b/src/jmespath.net/Functions/ReverseFunction.cs index 040008a..1d5d712 100644 --- a/src/jmespath.net/Functions/ReverseFunction.cs +++ b/src/jmespath.net/Functions/ReverseFunction.cs @@ -1,6 +1,7 @@ using System; using System.Linq; using DevLab.JmesPath.Utils; +using jmespath.net.Functions.Impl; using Newtonsoft.Json.Linq; namespace DevLab.JmesPath.Functions @@ -30,8 +31,8 @@ public override JToken Execute(params JmesPathFunctionArgument[] args) case "string": { var text = (Text)token.Value(); - var reversed = new Text(text.CodePoints.Reverse().ToArray()); - return new JValue((string)reversed); + var reversed = text.Invert(); + return new JValue((string) reversed); } case "array": { diff --git a/src/jmespath.net/Utils/Text.cs b/src/jmespath.net/Utils/Text.cs index 78f0fb9..d351755 100644 --- a/src/jmespath.net/Utils/Text.cs +++ b/src/jmespath.net/Utils/Text.cs @@ -11,6 +11,25 @@ namespace DevLab.JmesPath.Utils /// The class represents a sequence of Unicode codepoints. /// If differs from the .NET class in that is correctly /// handles codepoints from supplementary planes, including surrogate pairs. + /// + /// Characters in a .NET string are represented by the 21-bit code value + /// of a character in the Unicode codespace (0x0000 - 0x10FFFF). + /// Each character consists of either: + /// + /// - a single 16-bit codepoint (U+0000 to U+FFFF excluding the surrogate range U+D800 to U+DFFF). + /// In that case, a Unicode character, identified by its codepoint, maps to a single 16-bit code unit. + /// + /// or + /// + /// - a pair of 16-bit surrogate code units (high U+D800 to U+DBFF and low U+DC00 to U+DFFF). + /// In that case, a Unicode character, identified by its codepoint, maps to a sequence of two 16-bit code units. 
+ /// + /// Additionally, some Unicode characters can have multiple representations. For instance, the + /// character 'é' can be encoded using the following two different sequences of codepoints: + /// + /// - 'é' U+00E9 LATIN SMALL LETTER E WITH ACUTE ACCENT. + /// - 'é' U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT. + /// /// internal sealed partial class Text : IEnumerable, IEquatable, IComparable { diff --git a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs index 392a697..482e16f 100644 --- a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs +++ b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs @@ -56,6 +56,8 @@ JToken MakeObject(string text) [Theory] [InlineData("a𝌆b", "b𝌆a")] + [InlineData("aéb", "béa")] + [InlineData("ae\x0301b", "béa")] public void Reverse(string text, string expected) { diff --git a/tests/jmespathnet.tests/Utils/TextTest.cs b/tests/jmespathnet.tests/Utils/TextTest.cs index 58a0de0..e2889fb 100644 --- a/tests/jmespathnet.tests/Utils/TextTest.cs +++ b/tests/jmespathnet.tests/Utils/TextTest.cs @@ -10,6 +10,8 @@ public sealed class TextTest [Theory] [InlineData("𝌆", 1, "U+1D306 TETRAGRAM FOR CENTER")] [InlineData("😀", 1, "U+1F600 GRINNING FACE")] + [InlineData("é", 1, "U+00E9 LATIN SMALL LETTER E WITH ACUTE ACCENT")] + [InlineData("è", 1, "U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT")] public void Length(string text, int expected, string name) { var t = new Text(text);