Skip to content

Commit

Permalink
[unicode] reverse() learned not to break composite character sequen…
Browse files Browse the repository at this point in the history
…ces.
  • Loading branch information
springcomp committed Oct 18, 2022
1 parent 3b933a5 commit bd64a60
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 2 deletions.
17 changes: 17 additions & 0 deletions src/jmespath.net/Functions/Impl/TextExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
using DevLab.JmesPath.Utils;
using System;
using System.Linq;

namespace jmespath.net.Functions.Impl
{
    /// <summary>
    /// Extension methods on <see cref="Text" /> used by the string functions.
    /// </summary>
    internal static class TextExtensions
    {
        /// <summary>
        /// Supports the <c>reverse()</c> function.
        /// </summary>
        /// <param name="text">The text whose elements are to be reversed.</param>
        /// <returns>
        /// A new <see cref="Text" /> built by concatenating the strings yielded when
        /// enumerating <paramref name="text" />, in reverse order. Each yielded string
        /// is kept intact; only the order of the strings changes.
        /// </returns>
        public static Text Invert(this Text text)
        {
            // Enumerable.Reverse buffers its source on its own, so copying to an
            // array first is unnecessary. It is invoked as a plain static call so
            // that the LINQ overload is always the one selected.
            var reversed = Enumerable.Reverse<string>(text);

            return (Text)String.Join("", reversed);
        }
    }
}
5 changes: 3 additions & 2 deletions src/jmespath.net/Functions/ReverseFunction.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Linq;
using DevLab.JmesPath.Utils;
using jmespath.net.Functions.Impl;
using Newtonsoft.Json.Linq;

namespace DevLab.JmesPath.Functions
Expand Down Expand Up @@ -30,8 +31,8 @@ public override JToken Execute(params JmesPathFunctionArgument[] args)
case "string":
{
var text = (Text)token.Value<String>();
var reversed = new Text(text.CodePoints.Reverse().ToArray());
return new JValue((string)reversed);
var reversed = text.Invert();
return new JValue((string) reversed);
}
case "array":
{
Expand Down
19 changes: 19 additions & 0 deletions src/jmespath.net/Utils/Text.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,25 @@ namespace DevLab.JmesPath.Utils
/// The <see cref="Text" /> class represents a sequence of Unicode codepoints.
/// It differs from the .NET <see cref="String" /> class in that it correctly
/// handles codepoints from supplementary planes, including surrogate pairs.
///
/// Characters in a .NET string are stored as UTF-16 code units. A Unicode character
/// is identified by a 21-bit codepoint in the Unicode codespace (U+0000 to U+10FFFF);
/// it can consist of either:
///
/// - a single 16-bit code unit (U+0000 to U+FFFF excluding the surrogate range U+D800 to U+DFFF).
/// In that case, a Unicode character, identified by its codepoint, maps to a single 16-bit code unit.
///
/// or
///
/// - a pair of 16-bit surrogate code units (high U+D800 to U+DBFF and low U+DC00 to U+DFFF).
/// In that case, a Unicode character, identified by its codepoint, maps to a sequence of two 16-bit code units.
///
/// Additionally, some Unicode characters can have multiple representations. For instance, the
/// character 'é' can be encoded using the following two different sequences of codepoints:
///
/// - 'é' U+00E9 LATIN SMALL LETTER E WITH ACUTE (precomposed form).
/// - 'é' U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT (decomposed form).
///
/// </summary>
internal sealed partial class Text : IEnumerable<string>, IEquatable<Text>, IComparable<Text>
{
Expand Down
2 changes: 2 additions & 0 deletions tests/jmespathnet.tests/Utils/StringFunctionsTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ JToken MakeObject(string text)

[Theory]
[InlineData("a𝌆b", "b𝌆a")]
[InlineData("aéb", "béa")]
[InlineData("ae\x0301b", "béa")]

public void Reverse(string text, string expected)
{
Expand Down
2 changes: 2 additions & 0 deletions tests/jmespathnet.tests/Utils/TextTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ public sealed class TextTest
[Theory]
[InlineData("𝌆", 1, "U+1D306 TETRAGRAM FOR CENTER")]
[InlineData("😀", 1, "U+1F600 GRINNING FACE")]
[InlineData("é", 1, "U+00E9 LATIN SMALL LETTER E WITH ACUTE ACCENT")]
[InlineData("è", 1, "U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT")]
public void Length(string text, int expected, string name)
{
var t = new Text(text);
Expand Down

0 comments on commit bd64a60

Please sign in to comment.