Skip to content

Commit

Permalink
Fix/unicode surrogate pairs (#75)
Browse files Browse the repository at this point in the history
* [unicode] The `length()` function learned to correctly handle Unicode surrogate pairs and composite characters.

* [unicode] The `sort()` function learned to correctly handle Unicode surrogate pairs and composite characters.

* [unicode] The `sort_by()` function learned to correctly handle Unicode surrogate pairs and composite characters.

* [unicode] The `reverse()` function learned to correctly handle Unicode surrogate pairs and composite characters.

* [unicode] Updated compliance tests.

* [unicode] Fixed `length()` function.
  • Loading branch information
springcomp authored Oct 20, 2022
1 parent 30f6e4e commit 2b9fbf5
Show file tree
Hide file tree
Showing 12 changed files with 490 additions and 38 deletions.
2 changes: 1 addition & 1 deletion src/jmespath.net/Functions/LengthFunction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public override JToken Execute(params JmesPathFunctionArgument[] args)
switch (token.GetTokenType())
{
case "string":
return token.Value<String>().Length;
return ((Text)token.Value<String>()).Length;
case "array":
return ((JArray) token).Count;
case "object":
Expand Down
15 changes: 8 additions & 7 deletions src/jmespath.net/Functions/ReverseFunction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,16 @@ public override JToken Execute(params JmesPathFunctionArgument[] args)
switch (token.GetTokenType())
{
case "string":
{
var characters = token.Value<String>().Reverse().ToArray();
return new JValue(new string(characters));
}
{
var text = (Text)token.Value<String>();
var reversed = new Text(text.CodePoints.Reverse().ToArray());
return new JValue((string)reversed);
}
case "array":
{
{
var items = ((JArray)token).Reverse();
return new JArray().AddRange(items);
}
return new JArray().AddRange(items);
}
default:
return null;
}
Expand Down
84 changes: 64 additions & 20 deletions src/jmespath.net/Functions/SortByFunction.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using DevLab.JmesPath.Expressions;
using DevLab.JmesPath.Utils;
using Newtonsoft.Json.Linq;

Expand All @@ -21,35 +23,77 @@ public override JToken Execute(params JmesPathFunctionArgument[] args)
var array = (JArray)args[0].Token;
var expression = args[1].Expression;

var done = false;
if (array.Count == 0)
return new JArray();

var expectedItemType = "none";
// make sure this is a homogeneous array
// with all items from a single expected type

var ordered = array.OrderBy(u =>
{
var e = expression.Transform(u);
var keyCollection = array
.Select(u => expression.Transform(u).AsJToken())
.ToArray()
;

var actualItemType = keyCollection[0].GetTokenType();
if (actualItemType != "number" && actualItemType != "string")
throw new Exception($"Error: invalid-type, the expression argument of function {Name} should return a number or a string.");

if (keyCollection.Any(k => k.GetTokenType() != actualItemType))
throw new Exception($"Error: invalid-type, all items resulting from the evaluation of the expression argument of function {Name} should have the same type.");

var actualItemType = e.AsJToken().GetTokenType();
// sort array

if (!done)
{
if (actualItemType != "number" && actualItemType != "string")
throw new Exception($"Error: invalid-type, the expression argument of function {Name} should return a number or a string.");
var tokens = array.AsEnumerable().ToArray();
JToken[] ordered = tokens;

expectedItemType = actualItemType;
done = true;
}
if (actualItemType == "number")
{
var actualKeyTokenType = keyCollection[0].Type;
if (actualKeyTokenType == JTokenType.Float)
ordered = SortByNumbers<double>(tokens, expression);
else if (actualKeyTokenType == JTokenType.Integer)
ordered = SortByNumbers<int>(tokens, expression);
}
else
{
ordered = SortByText(tokens, expression);
}

if (expectedItemType != actualItemType)
throw new Exception("Error: invalid-type, all items resulting from the evaluation of the expression argument of function {Name} should have the same type.");

return e.AsJToken();
return new JArray(ordered);
}

}).ToArray();
private JToken[] SortByNumbers<T>(JToken[] array, JmesPathExpression expression)
{
T keySelector(JToken t) {
var token = expression.Transform(t).AsJToken();
return token.Value<T>();
};

return new JArray()
.AddRange(ordered)
var ordered = array
.OrderBy(keySelector)
.ToArray()
;

return ordered;
}
/// <summary>
/// Orders the items of <paramref name="array"/> by the string key that
/// <paramref name="expression"/> yields for each item. Keys are wrapped in
/// <see cref="Text"/> and compared with <see cref="Text.CodePointComparer"/>.
/// </summary>
/// <param name="array">The tokens to order.</param>
/// <param name="expression">The expression evaluated against each token to obtain its sort key.</param>
/// <returns>A new array holding the same tokens in sorted order.</returns>
private JToken[] SortByText(JToken[] array, JmesPathExpression expression)
{
    IComparer<Text> byCodePoint = Text.CodePointComparer;

    // Evaluate the expression against one item and wrap the resulting string as Text.
    Func<JToken, Text> textKeyOf = item =>
    {
        var evaluated = expression.Transform(item).AsJToken();
        return (Text)evaluated.Value<string>();
    };

    return array.OrderBy(textKeyOf, byCodePoint).ToArray();
}
}
}
27 changes: 18 additions & 9 deletions src/jmespath.net/Functions/SortFunction.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.Linq;
using DevLab.JmesPath.Utils;
using Newtonsoft.Json.Linq;
Expand Down Expand Up @@ -28,18 +29,26 @@ public override JToken Execute(params JmesPathFunctionArgument[] args)
var item = array[0];

if (item.Type == JTokenType.Float)
return new JArray().AddRange(Sort<double>(array));
return JArray.FromObject(SortNumber<double>(array));
else if (item.Type == JTokenType.Integer)
return new JArray().AddRange(Sort<int>(array));
return JArray.FromObject(SortNumber<int>(array));
else
return new JArray().AddRange(Sort<string>(array));
return JArray.FromObject(SortText(array));
}

/// <summary>
/// Orders the tokens of <paramref name="array"/> by their value converted to
/// <typeparamref name="T"/>, using the default comparer for that type.
/// </summary>
/// <param name="array">The array whose items are ordered.</param>
/// <returns>A new array holding the same tokens in ascending order.</returns>
private static JToken[] Sort<T>(JArray array)
{
    var sorted = array.OrderBy(item => item.Value<T>());
    return sorted.ToArray();
}
/// <summary>
/// Converts every item of <paramref name="array"/> to <typeparamref name="T"/>
/// and returns the converted values in ascending order.
/// </summary>
/// <param name="array">The array whose items are converted and ordered.</param>
/// <returns>The converted values, sorted with the default comparer for <typeparamref name="T"/>.</returns>
internal static T[] SortNumber<T>(JArray array)
{
    var values = array.Values<T>();
    return values.OrderBy(value => value).ToArray();
}

/// <summary>
/// Reads every item of <paramref name="array"/> as a string and returns the
/// strings ordered with <see cref="Text.CodePointComparer"/>.
/// </summary>
/// <param name="array">The array whose items are read as strings and ordered.</param>
/// <returns>The strings in sorted order.</returns>
internal static string[] SortText(JArray array)
{
    // Wrap each string as Text so the code point comparer can be applied.
    var texts = array.Select(item => (Text)item.Value<string>());
    var sorted = texts.OrderBy(text => text, Text.CodePointComparer);

    // Unwrap back to plain strings for the caller.
    return sorted.Select(text => (string)text).ToArray();
}
}
}
7 changes: 7 additions & 0 deletions src/jmespath.net/InternalsVisibleTo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
using System.Runtime.CompilerServices;

// Grants the "jmespathnet.tests" assembly access to this assembly's internal members.
// The friend assembly is identified by one public key when DEBUG is defined
// and by a different public key otherwise.
#if DEBUG
[assembly: InternalsVisibleTo("jmespathnet.tests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100055796df0ae0f975fabb3455d92c9edfef1e266fe66273a7f42c298406335fef71fdf99f46033f5f1e890fa2c6a5f230bfdd5832aa16eb45af02ad70ff716f97a51ff955abaaa2490da59ece7f2474dd43695c6bc8f1c82d1bb38f166fdfa7716e11291bda347bc8689d5435e68401a9ab5b4e8e49c1074173d21edf4fbda1b1")]
#else
[assembly: InternalsVisibleTo("jmespathnet.tests, PublicKey=002400000480000094000000060200000024000052534131000400000100010055709b8bb177721db5eb5a9e7437bfa5f46251aef5dcf91f4a36a7dcb98e51a8ecf5a37284004fa6694f3471f5dfc82244c9672eb085cd65c7cb75d8251aa971a349d4641b492ca0963b74fd9878a5872d6ccbb7b7ceff82aa3687c240a70b4d5565c7cff5df0a12cdbde58e937320fb302b7ccedff72008f3bec0bee8384dc5")]
#endif
79 changes: 79 additions & 0 deletions src/jmespath.net/Utils/CodePointEnumerator.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Globalization;

namespace DevLab.JmesPath.Utils
{
/// <summary>
/// An <see cref="IEnumerator{T}" /> of <see cref="int" /> that
/// enumerates the Unicode code points of a <see cref="Text" /> value.
/// </summary>
/// <remarks>
/// All code points are decoded once, in the constructor; the enumerator
/// then walks the resulting array.
/// </remarks>
internal sealed class CodePointEnumerator : IEnumerator<int>
{
    // The source text, kept as a plain string (not read by the enumerator itself).
    private readonly string text_;

    private readonly int[] codePoints_;
    private int index_ = -1;

    /// <summary>
    /// Initialize a new instance of the <see cref="CodePointEnumerator" /> class.
    /// </summary>
    /// <param name="text">The text whose code points are enumerated.</param>
    public CodePointEnumerator(Text text)
    {
        text_ = text;
        codePoints_ = GetCodePoints(text);
    }

    /// <summary>
    /// Exposes the remaining code points as a lazy sequence.
    /// </summary>
    /// <remarks>
    /// Iterating the sequence advances this enumerator, so a second
    /// iteration yields nothing unless <see cref="Reset" /> is called first.
    /// </remarks>
    public IEnumerable<int> AsEnumerable()
    {
        while (MoveNext())
            yield return Current;
    }

    /// <summary>
    /// The code point at the current position. Indexes the underlying array
    /// directly, so it throws <see cref="IndexOutOfRangeException" /> before
    /// the first <see cref="MoveNext" /> and after the end has been reached.
    /// </summary>
    public int Current
        => codePoints_[index_];

    object IEnumerator.Current
        => Current;

    /// <summary>
    /// Advances to the next code point.
    /// </summary>
    /// <returns><c>true</c> while a code point is available; otherwise <c>false</c>.</returns>
    public bool MoveNext()
        => ++index_ < codePoints_.Length;

    /// <summary>
    /// Rewinds the enumerator to its initial position, before the first code point.
    /// </summary>
    public void Reset()
    {
        index_ = -1;
    }

    /// <summary>
    /// Nothing to release.
    /// </summary>
    public void Dispose() { }

    /// <summary>
    /// Decodes the UTF-16 content of <paramref name="text" /> into Unicode code points.
    /// </summary>
    /// <param name="text">The text to decode.</param>
    /// <returns>
    /// One entry per code point, in order. A well-formed surrogate pair yields a
    /// single supplementary-plane code point; every other UTF-16 code unit
    /// (including an unpaired surrogate) is returned as-is.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="text" /> converts to a <c>null</c> string.
    /// </exception>
    internal static int[] GetCodePoints(Text text)
    {
        // Relies on the implicit Text -> string conversion also used by the constructor.
        string value = text;
        if (value == null)
            throw new ArgumentNullException(nameof(text));

        var codePoints = new List<int>(value.Length);

        // Decode straight from the UTF-16 code units instead of walking text
        // elements (grapheme clusters): a text element may contain several code
        // points (for instance a supplementary-plane character followed by
        // combining marks), and assuming it holds at most one surrogate pair
        // drops or splits code points.

        for (var index = 0; index < value.Length; index++)
        {
            if (Char.IsSurrogatePair(value, index))
            {
                codePoints.Add(Char.ConvertToUtf32(value, index));

                // skip the low surrogate that was just consumed
                index++;
            }
            else
            {
                codePoints.Add(value[index]);
            }
        }

        return codePoints.ToArray();
    }
}
}
Loading

0 comments on commit 2b9fbf5

Please sign in to comment.