From b7b23470bcd7a65aa2d561e2dbe684d94851d138 Mon Sep 17 00:00:00 2001 From: springcomp Date: Tue, 18 Oct 2022 08:54:20 +0200 Subject: [PATCH 1/6] [unicode] The `length()` function learned to correctly handle Unicode surrogate pairs and composite characters. --- src/jmespath.net/Functions/LengthFunction.cs | 2 +- src/jmespath.net/InternalsVisibleTo.cs | 7 ++++ src/jmespath.net/Utils/Text.cs | 41 +++++++++++++++++++ .../Utils/StringFunctionsTest.cs | 33 +++++++++++++++ 4 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 src/jmespath.net/InternalsVisibleTo.cs create mode 100644 src/jmespath.net/Utils/Text.cs create mode 100644 tests/jmespathnet.tests/Utils/StringFunctionsTest.cs diff --git a/src/jmespath.net/Functions/LengthFunction.cs b/src/jmespath.net/Functions/LengthFunction.cs index ff72c35..b7875cb 100644 --- a/src/jmespath.net/Functions/LengthFunction.cs +++ b/src/jmespath.net/Functions/LengthFunction.cs @@ -27,7 +27,7 @@ public override JToken Execute(params JmesPathFunctionArgument[] args) switch (token.GetTokenType()) { case "string": - return token.Value().Length; + return ((Text)token.Value()).Length; case "array": return ((JArray) token).Count; case "object": diff --git a/src/jmespath.net/InternalsVisibleTo.cs b/src/jmespath.net/InternalsVisibleTo.cs new file mode 100644 index 0000000..cf21c9c --- /dev/null +++ b/src/jmespath.net/InternalsVisibleTo.cs @@ -0,0 +1,7 @@ +using System.Runtime.CompilerServices; + +#if DEBUG +[assembly: InternalsVisibleTo("jmespathnet.tests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100055796df0ae0f975fabb3455d92c9edfef1e266fe66273a7f42c298406335fef71fdf99f46033f5f1e890fa2c6a5f230bfdd5832aa16eb45af02ad70ff716f97a51ff955abaaa2490da59ece7f2474dd43695c6bc8f1c82d1bb38f166fdfa7716e11291bda347bc8689d5435e68401a9ab5b4e8e49c1074173d21edf4fbda1b1")] +#else +[assembly: InternalsVisibleTo("jmespathnet.tests, PublicKey=002400000480000094000000060200000024000052534131000400000100010055709b8bb177721db5eb5a9e7437bfa5f46251aef5dcf91f4a36a7dcb98e51a8ecf5a37284004fa6694f3471f5dfc82244c9672eb085cd65c7cb75d8251aa971a349d4641b492ca0963b74fd9878a5872d6ccbb7b7ceff82aa3687c240a70b4d5565c7cff5df0a12cdbde58e937320fb302b7ccedff72008f3bec0bee8384dc5")] +#endif \ No newline at end of file diff --git a/src/jmespath.net/Utils/Text.cs b/src/jmespath.net/Utils/Text.cs new file mode 100644 index 0000000..55d9d79 --- /dev/null +++ b/src/jmespath.net/Utils/Text.cs @@ -0,0 +1,41 @@ +using System; +using System.Globalization; + +namespace DevLab.JmesPath.Utils +{ + /// + /// The class represents a sequence of Unicode codepoints. + /// If differs from the .NET class in that is correctly + /// handles codepoints from supplementary planes, including surrogate pairs. + /// + internal class Text + { + private readonly string text_; + private readonly StringInfo info_; + + /// + /// Initialize a new instance of the class. + /// + /// + public Text(string text) + { + text_ = text; + info_ = new StringInfo(text_); + } + + /// + /// The number of Unicode codepoints. + /// + public int Length + => info_.LengthInTextElements; + + public static implicit operator String(Text text) + => text.ToString(); + + public static explicit operator Text(string text) + => new Text(text); + + public override string ToString() + => text_; + } +} \ No newline at end of file diff --git a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs new file mode 100644 index 0000000..a250169 --- /dev/null +++ b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs @@ -0,0 +1,33 @@ +using DevLab.JmesPath.Functions; +using DevLab.JmesPath.Utils; +using Newtonsoft.Json.Linq; +using Xunit; + +namespace jmespath.net.tests.Utils +{ + public sealed class TextTest + { + [Theory] + [InlineData("𝌆", 1, "U+1D306 TETRAGRAM FOR CENTER")] + [InlineData("😀", 1, "U+1F600 GRINNING FACE")] + public void Length(string text, int expected, string name) + { + var t = new Text(text); + Assert.Equal(expected, t.Length); + } + } + + public sealed class StringFunctionsTest + { + [Theory] + [InlineData("𝌆", 1, "U+1D306 TETRAGRAM FOR CENTER")] + [InlineData("😀", 1, "U+1F600 GRINNING FACE")] + public void Length(string text, int expected, string name) + { + var length = new LengthFunction(); + var result = length.Execute(new JmesPathFunctionArgument(JToken.FromObject(text))); + + Assert.Equal(expected, result.Value()); + } + } +} \ No newline at end of file From a5cb4ad40f9b8c4aa6e348c64266f52bcfba10a4 Mon Sep 17 00:00:00 2001 From: springcomp Date: Tue, 18 Oct 2022 14:28:29 +0200 Subject: [PATCH 2/6] [unicode] The `sort()` function learned to correctly handle Unicode surrogate pairs and composite characters. --- src/jmespath.net/Functions/SortFunction.cs | 27 ++++-- src/jmespath.net/Utils/CodePointEnumerator.cs | 89 +++++++++++++++++++ src/jmespath.net/Utils/Text.cs | 89 ++++++++++++++++++- src/jmespath.net/Utils/TextComparer.cs | 13 +++ src/jmespath.net/Utils/TextEnumerator.cs | 40 +++++++++ .../Utils/StringFunctionsTest.cs | 24 ++--- tests/jmespathnet.tests/Utils/TextTest.cs | 57 ++++++++++++ 7 files changed, 317 insertions(+), 22 deletions(-) create mode 100644 src/jmespath.net/Utils/CodePointEnumerator.cs create mode 100644 src/jmespath.net/Utils/TextComparer.cs create mode 100644 src/jmespath.net/Utils/TextEnumerator.cs create mode 100644 tests/jmespathnet.tests/Utils/TextTest.cs diff --git a/src/jmespath.net/Functions/SortFunction.cs b/src/jmespath.net/Functions/SortFunction.cs index 247ca6d..77a8f39 100644 --- a/src/jmespath.net/Functions/SortFunction.cs +++ b/src/jmespath.net/Functions/SortFunction.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Linq; using DevLab.JmesPath.Utils; using Newtonsoft.Json.Linq; @@ -28,18 +29,26 @@ public override JToken Execute(params JmesPathFunctionArgument[] args) var item = array[0]; if (item.Type == JTokenType.Float) - return new JArray().AddRange(Sort(array)); + return JArray.FromObject(SortNumber(array)); else if (item.Type == JTokenType.Integer) - return new JArray().AddRange(Sort(array)); + return JArray.FromObject(SortNumber(array)); else - return new JArray().AddRange(Sort(array)); + return JArray.FromObject(SortText(array)); } - private static JToken[] Sort(JArray array) - { - return array - .OrderBy(u => u.Value()) - .ToArray(); - } + internal static T[] SortNumber(JArray array) + => array + .Values() + .OrderBy(u => u) + .ToArray() + ; + + internal static string[] SortText(JArray array) + => array + .Select(u => (Text)u.Value()) + .OrderBy(u => u, Text.CodePointComparer) + .Select(u => (string)u) + .ToArray() + ; } } \ No newline at end of file diff --git a/src/jmespath.net/Utils/CodePointEnumerator.cs b/src/jmespath.net/Utils/CodePointEnumerator.cs new file mode 100644 index 0000000..09c13fc --- /dev/null +++ b/src/jmespath.net/Utils/CodePointEnumerator.cs @@ -0,0 +1,89 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Globalization; + +namespace DevLab.JmesPath.Utils +{ + /// + /// An implementation + /// that enumerates Unicode codepoints in a string. + /// + internal sealed class CodePointEnumerator : IEnumerator + { + private readonly string text_; + + private int[] codePoints_; + private int index_ = -1; + + /// + /// Initialize a new instance of the class. + /// + /// + public CodePointEnumerator(Text text) + { + text_ = text; + codePoints_ = GetCodePoints(text); + } + + public IEnumerable AsEnumerable() + { + while (MoveNext()) + yield return Current; + } + + public int Current + => codePoints_[index_]; + + object IEnumerator.Current + => Current; + + public bool MoveNext() + => ++index_ < codePoints_.Length; + + public void Reset() + { + index_ = -1; + } + + public void Dispose() { } + + private static int[] GetCodePoints(Text text) + { + var codePoints = new List(text.Length); + + var enumerator = StringInfo.GetTextElementEnumerator(text); + while (enumerator.MoveNext()) + { + var element = enumerator.GetTextElement(); + if (element.Length == 1) + { + codePoints.Add(element[0]); + } + + else + { + // element represents either a codepoint from a supplementary plane + // encoded as a surrogate pair of UTF-16 code units. + + // or a composite character encoded as two codepoints. + + System.Diagnostics.Debug.Assert(element.Length == 2); + + if (Char.IsSurrogatePair(element[0], element[1])) + { + codePoints.Add(Char.ConvertToUtf32(element[0], element[1])); + } + + else + { + codePoints.Add(element[0]); + codePoints.Add(element[1]); + } + } + } + + return codePoints.ToArray(); + } + } +} \ No newline at end of file diff --git a/src/jmespath.net/Utils/Text.cs b/src/jmespath.net/Utils/Text.cs index 55d9d79..e88613e 100644 --- a/src/jmespath.net/Utils/Text.cs +++ b/src/jmespath.net/Utils/Text.cs @@ -1,5 +1,9 @@ using System; +using System.Collections; +using System.Collections.Generic; using System.Globalization; +using System.Linq; +using System.Text; namespace DevLab.JmesPath.Utils { @@ -8,11 +12,14 @@ namespace DevLab.JmesPath.Utils /// If differs from the .NET class in that is correctly /// handles codepoints from supplementary planes, including surrogate pairs. /// - internal class Text + internal sealed partial class Text : IEnumerable, IEquatable, IComparable { private readonly string text_; private readonly StringInfo info_; + private static readonly IComparer defaultComparer_ + = new TextComparer(); + /// /// Initialize a new instance of the class. /// @@ -23,18 +30,98 @@ public Text(string text) info_ = new StringInfo(text_); } + /// + /// Returns an implementation + /// that compares Text using the numerical value of its codepoints. + /// + public static IComparer CodePointComparer + => defaultComparer_; + + /// + /// Initialize a new instance of the class. + /// + /// + public Text(params int[] codePoints) + { + var sb = new StringBuilder(); + foreach (var codePoint in codePoints) + sb.Append(Char.ConvertFromUtf32(codePoint)); + + text_ = sb.ToString(); + info_ = new StringInfo(text_); + } + /// /// The number of Unicode codepoints. /// public int Length => info_.LengthInTextElements; + /// + /// Returns a enumerator over the sequence of Unicode codepoints. + /// + /// + public IEnumerator GetCodePointsEnumerator() + => new CodePointEnumerator(this); + + /// + /// The sequence of Unicode codepoints. + /// + public IEnumerable CodePoints + => new CodePointEnumerator(this).AsEnumerable(); + public static implicit operator String(Text text) => text.ToString(); public static explicit operator Text(string text) => new Text(text); + public IEnumerator GetEnumerator() + => new TextEnumerator(text_); + + /// + /// Returns true if the two strings are equal + /// i.e. if the two sequences of Unicode codepoints + /// are identical. + /// + /// + /// + public bool Equals(Text other) + => CompareTo(other) == 0; + + /// + /// Compares the two sequences of Unicode codepoints. + /// A string will sort based on the numerical value + /// of its first differring codepoint. + /// + /// + /// + /// + public int CompareTo(Text other) + { + var length = Math.Min(this.Length, other.Length); + var codePoints = this.CodePoints.ToArray(); + var otherCodePoints = other.CodePoints.ToArray(); + + for (var index = 0; index < length; index++) + { + if (codePoints[index] < otherCodePoints[index]) + return -1; + else if (codePoints[index] > otherCodePoints[index]) + return 1; + } + + if (codePoints.Length < otherCodePoints.Length) + return -1; + else if (codePoints.Length > otherCodePoints.Length) + return 1; + + return 0; + } + + IEnumerator IEnumerable.GetEnumerator() + => GetEnumerator(); + public override string ToString() => text_; } diff --git a/src/jmespath.net/Utils/TextComparer.cs b/src/jmespath.net/Utils/TextComparer.cs new file mode 100644 index 0000000..e85abaf --- /dev/null +++ b/src/jmespath.net/Utils/TextComparer.cs @@ -0,0 +1,13 @@ +using System.Collections.Generic; + +namespace DevLab.JmesPath.Utils +{ + internal sealed partial class Text + { + public sealed class TextComparer : IComparer + { + int IComparer.Compare(Text x, Text y) + => x.CompareTo(y); + } + } +} \ No newline at end of file diff --git a/src/jmespath.net/Utils/TextEnumerator.cs b/src/jmespath.net/Utils/TextEnumerator.cs new file mode 100644 index 0000000..d5b4dc2 --- /dev/null +++ b/src/jmespath.net/Utils/TextEnumerator.cs @@ -0,0 +1,40 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Globalization; + +namespace DevLab.JmesPath.Utils +{ + /// + /// An implementation + /// that enumerates Unicode characters in a string while + /// correctly handling surrogate pairs. + /// + internal sealed class TextEnumerator : IEnumerator + { + private readonly TextElementEnumerator enum_; + + /// + /// Initialize a new instance of the class. + /// + /// + public TextEnumerator(String text) + { + enum_ = StringInfo.GetTextElementEnumerator(text); + } + + public string Current + => enum_.GetTextElement(); + + object IEnumerator.Current + => Current; + + public bool MoveNext() + => enum_.MoveNext(); + + public void Reset() + => enum_.Reset(); + + public void Dispose() { } + } +} \ No newline at end of file diff --git a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs index a250169..f6ae2ee 100644 --- a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs +++ b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs @@ -1,21 +1,10 @@ using DevLab.JmesPath.Functions; -using DevLab.JmesPath.Utils; using Newtonsoft.Json.Linq; +using System.Linq; using Xunit; namespace jmespath.net.tests.Utils { - public sealed class TextTest - { - [Theory] - [InlineData("𝌆", 1, "U+1D306 TETRAGRAM FOR CENTER")] - [InlineData("😀", 1, "U+1F600 GRINNING FACE")] - public void Length(string text, int expected, string name) - { - var t = new Text(text); - Assert.Equal(expected, t.Length); - } - } public sealed class StringFunctionsTest { @@ -29,5 +18,16 @@ public void Length(string text, int expected, string name) Assert.Equal(expected, result.Value()); } + + [Theory] + [InlineData(new[] { "𝌆", "\xfb06", "\xfb06yle", "\xfb03" }, new[] { "\xfb03", "\xfb06", "\xfb06yle", "𝌆" })] + public void Sort(string[] strings, string[] expected) + { + var sort = new SortFunction(); + var result = (JArray) sort.Execute(new JmesPathFunctionArgument(JArray.FromObject(strings))); + var actual = result.Select(u => u.Value()).ToArray(); + + Assert.True(Enumerable.SequenceEqual(expected, actual)); + } } } \ No newline at end of file diff --git a/tests/jmespathnet.tests/Utils/TextTest.cs b/tests/jmespathnet.tests/Utils/TextTest.cs new file mode 100644 index 0000000..58a0de0 --- /dev/null +++ b/tests/jmespathnet.tests/Utils/TextTest.cs @@ -0,0 +1,57 @@ +using DevLab.JmesPath.Utils; +using System.Collections.Generic; +using System.Linq; +using Xunit; + +namespace jmespath.net.tests.Utils +{ + public sealed class TextTest + { + [Theory] + [InlineData("𝌆", 1, "U+1D306 TETRAGRAM FOR CENTER")] + [InlineData("😀", 1, "U+1F600 GRINNING FACE")] + public void Length(string text, int expected, string name) + { + var t = new Text(text); + Assert.Equal(expected, t.Length); + } + + [Theory] + [InlineData("a😀b", new[] { "a", "😀", "b" })] + public void AsEnumerable(string text, string[] array) + { + var t = new Text(text); + + var collection = new List(); + foreach (var uc in t) + collection.Add(uc); + + Assert.True(Enumerable.SequenceEqual(collection, array)); + } + + [Theory] + [InlineData("a😀b", new[] { 0x61, 0x1f600, 0x62 })] + [InlineData("élément", new[] { 0xe9, 0x6c, 0xe9, 0x6d, 0x65, 0x6e, 0x74 })] + [InlineData("e\x0301lément", new[] { 0x65, 0x0301, 0x6c, 0xe9, 0x6d, 0x65, 0x6e, 0x74 })] + public void CodePoints(string text, int[] array) + { + var t = new Text(text); + var codePoints = t.CodePoints.ToArray(); + Assert.True(Enumerable.SequenceEqual(codePoints, array)); + } + + [Theory] + [InlineData(new[] { 0x61, 0x1f600, 0x62 }, "a😀b")] + [InlineData(new[] { 0xe9, 0x6c, 0xe9, 0x6d, 0x65, 0x6e, 0x74 }, "élément")] + [InlineData(new[] { 0x65, 0x0301, 0x6c, 0xe9, 0x6d, 0x65, 0x6e, 0x74 }, "e\x0301lément")] + public void FromCodePoints(int[] array, string expected) + => Assert.Equal(expected, new Text(array)); + + [Theory] + [InlineData(new[] { "less than", "less than or equal"}, -1)] + [InlineData(new[] { "identical", "identical"}, 0)] + [InlineData(new[] { "greater than", "greater"}, 1)] + public void Compare(string[] texts, int expected) + => Assert.Equal(expected, new Text(texts[0]).CompareTo(new Text(texts[1]))); + } +} \ No newline at end of file From a4b33c19ffbc93c7f6285ab6e2046faeba5194e6 Mon Sep 17 00:00:00 2001 From: springcomp Date: Tue, 18 Oct 2022 15:45:44 +0200 Subject: [PATCH 3/6] [unicode] The `sort_by()` function learned to correctly handle Unicode surrogate pairs and composite characters. --- src/jmespath.net/Functions/SortByFunction.cs | 84 ++++++++++++++----- .../Utils/StringFunctionsTest.cs | 26 +++++- 2 files changed, 89 insertions(+), 21 deletions(-) diff --git a/src/jmespath.net/Functions/SortByFunction.cs b/src/jmespath.net/Functions/SortByFunction.cs index 444e5ad..a9e6b2d 100644 --- a/src/jmespath.net/Functions/SortByFunction.cs +++ b/src/jmespath.net/Functions/SortByFunction.cs @@ -1,5 +1,7 @@ using System; +using System.Collections.Generic; using System.Linq; +using DevLab.JmesPath.Expressions; using DevLab.JmesPath.Utils; using Newtonsoft.Json.Linq; @@ -21,35 +23,77 @@ public override JToken Execute(params JmesPathFunctionArgument[] args) var array = (JArray)args[0].Token; var expression = args[1].Expression; - var done = false; + if (array.Count == 0) + return new JArray(); - var expectedItemType = "none"; + // make sure this is an homogeneous array + // with all items from a single expected type - var ordered = array.OrderBy(u => - { - var e = expression.Transform(u); + var keyCollection = array + .Select(u => expression.Transform(u).AsJToken()) + .ToArray() + ; + + var actualItemType = keyCollection[0].GetTokenType(); + if (actualItemType != "number" && actualItemType != "string") + throw new Exception($"Error: invalid-type, the expression argument of function {Name} should return a number or a string."); + + if (keyCollection.Any(k => k.GetTokenType() != actualItemType)) + throw new Exception($"Error: invalid-type, all items resulting from the evaluation of the expression argument of function {Name} should have the same type."); - var actualItemType = e.AsJToken().GetTokenType(); + // sort array - if (!done) - { - if (actualItemType != "number" && actualItemType != "string") - throw new Exception($"Error: invalid-type, the expression argument of function {Name} should return a number or a string."); + var tokens = array.AsEnumerable().ToArray(); + JToken[] ordered = tokens; - expectedItemType = actualItemType; - done = true; - } + if (actualItemType == "number") + { + var actualKeyTokenType = keyCollection[0].Type; + if (actualKeyTokenType == JTokenType.Float) + ordered = SortByNumbers(tokens, expression); + else if (actualKeyTokenType == JTokenType.Integer) + ordered = SortByNumbers(tokens, expression); + } + else + { + ordered = SortByText(tokens, expression); + } - if (expectedItemType != actualItemType) - throw new Exception("Error: invalid-type, all items resulting from the evaluation of the expression argument of function {Name} should have the same type."); - - return e.AsJToken(); + return new JArray(ordered); + } - }).ToArray(); + private JToken[] SortByNumbers(JToken[] array, JmesPathExpression expression) + { + T keySelector(JToken t) { + var token = expression.Transform(t).AsJToken(); + return token.Value(); + }; - return new JArray() - .AddRange(ordered) + var ordered = array + .OrderBy(keySelector) + .ToArray() ; + + return ordered; + } + private JToken[] SortByText(JToken[] array, JmesPathExpression expression) + { + Text keySelector(JToken t) + { + var key = expression.Transform(t).AsJToken(); + return (Text) key.Value(); + }; + IComparer comparer = Text.CodePointComparer; + + var ordered = array + .OrderBy( + keySelector, + comparer + ) + .ToArray() + ; + + return ordered; } } } \ No newline at end of file diff --git a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs index f6ae2ee..faa042e 100644 --- a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs +++ b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs @@ -1,4 +1,5 @@ -using DevLab.JmesPath.Functions; +using DevLab.JmesPath.Expressions; +using DevLab.JmesPath.Functions; using Newtonsoft.Json.Linq; using System.Linq; using Xunit; @@ -29,5 +30,28 @@ public void Sort(string[] strings, string[] expected) Assert.True(Enumerable.SequenceEqual(expected, actual)); } + + [Theory] + [InlineData(new[] { "𝌆", "\xfb06", "\xfb06yle", "\xfb03" }, new[] { "\xfb03", "\xfb06", "\xfb06yle", "𝌆" })] + public void SortBy(string[] strings, string[] expected) + { + var sortBy = new SortByFunction(); + + JToken MakeObject(string text) + => JToken.Parse($"{{\"foo\": \"{text}\"}}"); + + var expectedArray = JArray.FromObject(expected.Select(MakeObject).ToArray()); + var inputArray = JArray.FromObject(strings.Select(MakeObject).ToArray()); + + var by = new JmesPathIdentifier("foo"); + JmesPathExpression.MakeExpressionType(by); + + var actualArray = (JArray) sortBy.Execute( + new JmesPathFunctionArgument(inputArray), + new JmesPathFunctionArgument(by) + ); + + Assert.True(JToken.DeepEquals(expectedArray, actualArray)); + } } } \ No newline at end of file From 3b933a544fb70c4e8b02c5be5b47976de5d75a08 Mon Sep 17 00:00:00 2001 From: springcomp Date: Tue, 18 Oct 2022 15:51:49 +0200 Subject: [PATCH 4/6] [unicode] The function `reverse()` learned to correctly handle Unicode surrogate pairs and composite characters. --- src/jmespath.net/Functions/ReverseFunction.cs | 15 +++++++------- src/jmespath.net/Utils/Text.cs | 14 ++++++------- .../Utils/StringFunctionsTest.cs | 20 +++++++++++++++++-- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/jmespath.net/Functions/ReverseFunction.cs b/src/jmespath.net/Functions/ReverseFunction.cs index 817fe91..040008a 100644 --- a/src/jmespath.net/Functions/ReverseFunction.cs +++ b/src/jmespath.net/Functions/ReverseFunction.cs @@ -28,15 +28,16 @@ public override JToken Execute(params JmesPathFunctionArgument[] args) switch (token.GetTokenType()) { case "string": - { - var characters = token.Value().Reverse().ToArray(); - return new JValue(new string(characters)); - } + { + var text = (Text)token.Value(); + var reversed = new Text(text.CodePoints.Reverse().ToArray()); + return new JValue((string)reversed); + } case "array": - { + { var items = ((JArray)token).Reverse(); - return new JArray().AddRange(items); - } + return new JArray().AddRange(items); + } default: return null; } diff --git a/src/jmespath.net/Utils/Text.cs b/src/jmespath.net/Utils/Text.cs index e88613e..78f0fb9 100644 --- a/src/jmespath.net/Utils/Text.cs +++ b/src/jmespath.net/Utils/Text.cs @@ -30,13 +30,6 @@ public Text(string text) info_ = new StringInfo(text_); } - /// - /// Returns an implementation - /// that compares Text using the numerical value of its codepoints. - /// - public static IComparer CodePointComparer - => defaultComparer_; - /// /// Initialize a new instance of the class. /// @@ -51,6 +44,13 @@ public Text(params int[] codePoints) info_ = new StringInfo(text_); } + /// + /// Returns an implementation + /// that compares Text using the numerical value of its codepoints. + /// + public static IComparer CodePointComparer + => defaultComparer_; + /// /// The number of Unicode codepoints. /// diff --git a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs index faa042e..392a697 100644 --- a/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs +++ b/tests/jmespathnet.tests/Utils/StringFunctionsTest.cs @@ -25,7 +25,7 @@ public void Length(string text, int expected, string name) public void Sort(string[] strings, string[] expected) { var sort = new SortFunction(); - var result = (JArray) sort.Execute(new JmesPathFunctionArgument(JArray.FromObject(strings))); + var result = (JArray)sort.Execute(new JmesPathFunctionArgument(JArray.FromObject(strings))); var actual = result.Select(u => u.Value()).ToArray(); Assert.True(Enumerable.SequenceEqual(expected, actual)); @@ -46,12 +46,28 @@ JToken MakeObject(string text) var by = new JmesPathIdentifier("foo"); JmesPathExpression.MakeExpressionType(by); - var actualArray = (JArray) sortBy.Execute( + var actualArray = (JArray)sortBy.Execute( new JmesPathFunctionArgument(inputArray), new JmesPathFunctionArgument(by) ); Assert.True(JToken.DeepEquals(expectedArray, actualArray)); } + + [Theory] + [InlineData("a𝌆b", "b𝌆a")] + + public void Reverse(string text, string expected) + { + var reverse = new ReverseFunction(); + var argument = new JmesPathFunctionArgument(JToken.FromObject(text)); + + var actual = reverse + .Execute(argument) + .Value() + ; + + Assert.Equal(expected, actual); + } } } \ No newline at end of file From f23a6f75651ec9e6e97239da0bb5513f38135112 Mon Sep 17 00:00:00 2001 From: springcomp Date: Wed, 19 Oct 2022 08:15:55 +0200 Subject: [PATCH 5/6] [unicode] Updated compliance tests. --- tools/jmespathnet.compliance/jmespath.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/jmespathnet.compliance/jmespath.test b/tools/jmespathnet.compliance/jmespath.test index 92f235b..0be0a71 160000 --- a/tools/jmespathnet.compliance/jmespath.test +++ b/tools/jmespathnet.compliance/jmespath.test @@ -1 +1 @@ -Subproject commit 92f235b1237bf68c84b53679a45cb04eec0935d0 +Subproject commit 0be0a71faefff2bd83cc6b29d85bb996e72d4474 From 2b62110b8f1b5e4f041e59d87aad333fe2f3d25d Mon Sep 17 00:00:00 2001 From: springcomp Date: Wed, 19 Oct 2022 09:26:11 +0200 Subject: [PATCH 6/6] [unicode] Fixed `length()` function. --- src/jmespath.net/Utils/CodePointEnumerator.cs | 32 +++++++------------ src/jmespath.net/Utils/Text.cs | 6 ++-- tests/jmespathnet.tests/Utils/TextTest.cs | 1 + 3 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/jmespath.net/Utils/CodePointEnumerator.cs b/src/jmespath.net/Utils/CodePointEnumerator.cs index 09c13fc..dc6b4b8 100644 --- a/src/jmespath.net/Utils/CodePointEnumerator.cs +++ b/src/jmespath.net/Utils/CodePointEnumerator.cs @@ -48,38 +48,28 @@ public void Reset() public void Dispose() { } - private static int[] GetCodePoints(Text text) + internal static int[] GetCodePoints(Text text) { - var codePoints = new List(text.Length); + var codePoints = new List(); var enumerator = StringInfo.GetTextElementEnumerator(text); while (enumerator.MoveNext()) { var element = enumerator.GetTextElement(); - if (element.Length == 1) + + // element represents either a codepoint from the basic multilingual plane + // or a supplementary plane encoded as a pair of surrogate UTF-16 code units. + + if (element.Length > 1 && (Char.IsSurrogatePair(element[0], element[1]))) { - codePoints.Add(element[0]); + System.Diagnostics.Debug.Assert(element.Length == 2); + codePoints.Add(Char.ConvertToUtf32(element[0], element[1])); } else { - // element represents either a codepoint from a supplementary plane - // encoded as a surrogate pair of UTF-16 code units. - - // or a composite character encoded as two codepoints. - - System.Diagnostics.Debug.Assert(element.Length == 2); - - if (Char.IsSurrogatePair(element[0], element[1])) - { - codePoints.Add(Char.ConvertToUtf32(element[0], element[1])); - } - - else - { - codePoints.Add(element[0]); - codePoints.Add(element[1]); - } + foreach (var codePoint in element) + codePoints.Add(codePoint); } } diff --git a/src/jmespath.net/Utils/Text.cs b/src/jmespath.net/Utils/Text.cs index 78f0fb9..d2304cb 100644 --- a/src/jmespath.net/Utils/Text.cs +++ b/src/jmespath.net/Utils/Text.cs @@ -55,7 +55,7 @@ public static IComparer CodePointComparer /// The number of Unicode codepoints. /// public int Length - => info_.LengthInTextElements; + => CodePoints.Length; /// /// Returns a enumerator over the sequence of Unicode codepoints. @@ -67,8 +67,8 @@ public IEnumerator GetCodePointsEnumerator() /// /// The sequence of Unicode codepoints. /// - public IEnumerable CodePoints - => new CodePointEnumerator(this).AsEnumerable(); + public int[] CodePoints + => CodePointEnumerator.GetCodePoints(this); public static implicit operator String(Text text) => text.ToString(); diff --git a/tests/jmespathnet.tests/Utils/TextTest.cs b/tests/jmespathnet.tests/Utils/TextTest.cs index 58a0de0..3e223a2 100644 --- a/tests/jmespathnet.tests/Utils/TextTest.cs +++ b/tests/jmespathnet.tests/Utils/TextTest.cs @@ -10,6 +10,7 @@ public sealed class TextTest [Theory] [InlineData("𝌆", 1, "U+1D306 TETRAGRAM FOR CENTER")] [InlineData("😀", 1, "U+1F600 GRINNING FACE")] + [InlineData("e\u0301", 2, "U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT")] public void Length(string text, int expected, string name) { var t = new Text(text);