Skip to content

Commit

Permalink
Fix JSON serialization for UTF-32 characters.
Browse files Browse the repository at this point in the history
When serializing the data in JSON-compatible form, 4-byte UTF-32 characters need to be split into two 2-byte UTF-16 code units (a surrogate pair).

This change fixes that by introducing a new emitter setting, `UseUtf16SurrogatePairs`, which is set when a JSON-compatible builder is requested.
  • Loading branch information
nahk-ivanov committed Oct 24, 2024
1 parent 7923dd8 commit 6c26d5c
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 4 deletions.
9 changes: 9 additions & 0 deletions YamlDotNet.Test/Serialization/SerializationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,15 @@ public void SerializationOfAnchorWorksInJson()
.BeEquivalentTo(@"{""x"": {""z"": {""v"": ""1""}}, ""y"": {""k"": {""z"": {""v"": ""1""}}}}");
}

/// <summary>
/// Verifies that a JSON-compatible serializer escapes a character outside the Basic
/// Multilingual Plane as a UTF-16 surrogate pair.
/// </summary>
[Fact]
public void SerializationOfUtf32WorksInJson()
{
    // U+1F99E cannot be represented by a single '\uXXXX' escape, so the JSON-compatible
    // output is expected to contain the surrogate pair D83E/DD9E instead.
    var serializer = SerializerBuilder.JsonCompatible().Build();
    var input = new { TestProperty = "Sea life \U0001F99E" };

    var json = serializer.Serialize(input).Trim();

    json.Should().Be(@"{""TestProperty"": ""Sea life \uD83E\uDD9E""}");
}

[Fact]
// Todo: this is actually roundtrip
public void DeserializationOfDefaultsWorkInJson()
Expand Down
18 changes: 16 additions & 2 deletions YamlDotNet/Core/Emitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public class Emitter : IEmitter
private bool isWhitespace;
private bool isIndentation;
private readonly bool forceIndentLess;
private readonly bool useUtf16SurrogatePair;
private readonly string newLine;

private bool isDocumentEndWritten;
Expand Down Expand Up @@ -148,6 +149,7 @@ public Emitter(TextWriter output, EmitterSettings settings)
this.maxSimpleKeyLength = settings.MaxSimpleKeyLength;
this.skipAnchorName = settings.SkipAnchorName;
this.forceIndentLess = !settings.IndentSequences;
this.useUtf16SurrogatePair = settings.UseUtf16SurrogatePairs;
this.newLine = settings.NewLine;

this.output = output;
Expand Down Expand Up @@ -1189,8 +1191,20 @@ private void WriteDoubleQuotedScalar(string value, bool allowBreaks)
{
if (index + 1 < value.Length && IsLowSurrogate(value[index + 1]))
{
Write('U');
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
if (useUtf16SurrogatePair)
{
Write('u');
Write(code.ToString("X04", CultureInfo.InvariantCulture));
Write('\\');
Write('u');
Write(((ushort)value[index + 1]).ToString("X04", CultureInfo.InvariantCulture));
}
else
{
Write('U');
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
}

index++;
}
else
Expand Down
32 changes: 31 additions & 1 deletion YamlDotNet/Core/EmitterSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,22 @@ public sealed class EmitterSettings
/// </summary>
public bool IndentSequences { get; }

/// <summary>
/// If true, characters outside the Basic Multilingual Plane are escaped as a UTF-16
/// surrogate pair (two '\uXXXX' escapes) instead of a single '\UXXXXXXXX' escape.
/// </summary>
/// <remarks>
/// This ensures compatibility with the JSON format, which does not allow '\UXXXXXXXX'
/// and instead expects two escaped 2-byte code units, '\uXXXX\uXXXX'.
/// </remarks>
public bool UseUtf16SurrogatePairs { get; }

public static readonly EmitterSettings Default = new EmitterSettings();

/// <summary>
/// Initializes a new instance of <see cref="EmitterSettings"/> without assigning any
/// property explicitly; each property keeps the initial value given by its declaration.
/// </summary>
public EmitterSettings()
{
}

public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null)
public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, bool useUtf16SurrogatePairs = false, string? newLine = null)
{
if (bestIndent < 2 || bestIndent > 9)
{
Expand All @@ -92,6 +101,7 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS
MaxSimpleKeyLength = maxSimpleKeyLength;
SkipAnchorName = skipAnchorName;
IndentSequences = indentSequences;
UseUtf16SurrogatePairs = useUtf16SurrogatePairs;
NewLine = newLine ?? Environment.NewLine;
}

Expand All @@ -104,6 +114,7 @@ public EmitterSettings WithBestIndent(int bestIndent)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -117,6 +128,7 @@ public EmitterSettings WithBestWidth(int bestWidth)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -130,6 +142,7 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength)
maxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -143,6 +156,7 @@ public EmitterSettings WithNewLine(string newLine)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
newLine
);
}
Expand All @@ -167,6 +181,7 @@ public EmitterSettings WithoutAnchorName()
MaxSimpleKeyLength,
true,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -180,6 +195,21 @@ public EmitterSettings WithIndentedSequences()
MaxSimpleKeyLength,
SkipAnchorName,
true,
UseUtf16SurrogatePairs,
NewLine
);
}

/// <summary>
/// Returns a copy of these settings with <see cref="UseUtf16SurrogatePairs"/> enabled.
/// Every other setting is carried over unchanged.
/// </summary>
/// <returns>A new <see cref="EmitterSettings"/> instance; this instance is not modified.</returns>
public EmitterSettings WithUtf16SurrogatePairs()
{
    return new EmitterSettings(
        bestIndent: BestIndent,
        bestWidth: BestWidth,
        isCanonical: IsCanonical,
        maxSimpleKeyLength: MaxSimpleKeyLength,
        skipAnchorName: SkipAnchorName,
        indentSequences: IndentSequences,
        useUtf16SurrogatePairs: true,
        newLine: NewLine
    );
}
Expand Down
3 changes: 2 additions & 1 deletion YamlDotNet/Serialization/SerializerBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ public SerializerBuilder JsonCompatible()
{
this.emitterSettings = this.emitterSettings
.WithMaxSimpleKeyLength(int.MaxValue)
.WithoutAnchorName();
.WithoutAnchorName()
.WithUtf16SurrogatePairs();

return this
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf<GuidConverter>())
Expand Down

0 comments on commit 6c26d5c

Please sign in to comment.