Skip to content

Commit

Permalink
Handle surrogate pairs during scanning (dart-lang/yaml#159)
Browse files Browse the repository at this point in the history
Change back to readChar() whenever possible; remove the need to decode the surrogate for further checking
  • Loading branch information
tamcy authored Jul 20, 2024
1 parent 7440807 commit a0f9e57
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 29 deletions.
1 change: 1 addition & 0 deletions pkgs/yaml/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 3.1.3-wip

* Require Dart 3.4
* Fix UTF-16 surrogate pair handling in plain scalars.

## 3.1.2

Expand Down
61 changes: 37 additions & 24 deletions pkgs/yaml/lib/src/scanner.dart
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ class Scanner {
null => false,
LF || CR || BOM => false,
TAB || NEL => true,
_ => _isStandardCharacter(char),
_ => _isStandardCharacterAt(0),
};
}

Expand All @@ -267,7 +267,7 @@ class Scanner {
null => false,
LF || CR || BOM || SP => false,
NEL => true,
_ => _isStandardCharacter(char),
_ => _isStandardCharacterAt(0),
};
}

Expand Down Expand Up @@ -614,9 +614,9 @@ class Scanner {

// Consume the indicator token.
var start = _scanner.state;
_scanner.readChar();
_scanner.readChar();
_scanner.readChar();
_scanner.readCodePoint();
_scanner.readCodePoint();
_scanner.readCodePoint();

_tokens.add(Token(type, _scanner.spanFrom(start)));
}
Expand Down Expand Up @@ -732,7 +732,7 @@ class Scanner {
/// The span of the new token is the current character.
void _addCharToken(TokenType type) {
var start = _scanner.state;
_scanner.readChar();
_scanner.readCodePoint();
_tokens.add(Token(type, _scanner.spanFrom(start)));
}

Expand Down Expand Up @@ -836,7 +836,7 @@ class Scanner {
// libyaml doesn't support unknown directives, but the spec says to ignore
// them and warn: http://yaml.org/spec/1.2/spec.html#id2781147.
while (!_isBreakOrEnd) {
_scanner.readChar();
_scanner.readCodePoint();
}

return null;
Expand Down Expand Up @@ -866,7 +866,7 @@ class Scanner {
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
var start = _scanner.position;
while (_isNonSpace) {
_scanner.readChar();
_scanner.readCodePoint();
}

var name = _scanner.substring(start);
Expand Down Expand Up @@ -941,13 +941,13 @@ class Scanner {
var start = _scanner.state;

// Eat the indicator character.
_scanner.readChar();
_scanner.readCodePoint();

// libyaml only allows word characters in anchor names, but the spec
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
var startPosition = _scanner.position;
while (_isAnchorChar) {
_scanner.readChar();
_scanner.readCodePoint();
}
var name = _scanner.substring(startPosition);

Expand Down Expand Up @@ -1032,7 +1032,7 @@ class Scanner {
buffer.write(_scanner.substring(start));

if (_scanner.peekChar() == EXCLAMATION) {
buffer.writeCharCode(_scanner.readChar());
buffer.writeCharCode(_scanner.readCodePoint());
} else {
// It's either the '!' tag or not really a tag handle. If it's a %TAG
// directive, it's an error. If it's a tag token, it must be part of a
Expand Down Expand Up @@ -1083,15 +1083,15 @@ class Scanner {
var start = _scanner.state;

// Eat the indicator '|' or '>'.
_scanner.readChar();
_scanner.readCodePoint();

// Check for a chomping indicator.
var chomping = _Chomping.clip;
var increment = 0;
var char = _scanner.peekChar();
if (char == PLUS || char == HYPHEN) {
chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
_scanner.readChar();
_scanner.readCodePoint();

// Check for an indentation indicator.
if (_isDigit) {
Expand All @@ -1101,7 +1101,7 @@ class Scanner {
_scanner.spanFrom(start));
}

increment = _scanner.readChar() - NUMBER_0;
increment = _scanner.readCodePoint() - NUMBER_0;
}
} else if (_isDigit) {
// Do the same as above, but in the opposite order.
Expand All @@ -1110,12 +1110,12 @@ class Scanner {
_scanner.spanFrom(start));
}

increment = _scanner.readChar() - NUMBER_0;
increment = _scanner.readCodePoint() - NUMBER_0;

char = _scanner.peekChar();
if (char == PLUS || char == HYPHEN) {
chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
_scanner.readChar();
_scanner.readCodePoint();
}
}

Expand Down Expand Up @@ -1182,7 +1182,7 @@ class Scanner {

var startPosition = _scanner.position;
while (!_isBreakOrEnd) {
_scanner.readChar();
_scanner.readCodePoint();
}
buffer.write(_scanner.substring(startPosition));
end = _scanner.state;
Expand Down Expand Up @@ -1373,7 +1373,7 @@ class Scanner {
buffer.writeCharCode(value);
}
} else {
buffer.writeCharCode(_scanner.readChar());
buffer.writeCharCode(_scanner.readCodePoint());
}
}

Expand Down Expand Up @@ -1462,7 +1462,7 @@ class Scanner {
// 1.2's. We use [_isPlainChar] instead of libyaml's character here.
var startPosition = _scanner.position;
while (_isPlainChar) {
_scanner.readChar();
_scanner.readCodePoint();
}
buffer.write(_scanner.substring(startPosition));
end = _scanner.state;
Expand Down Expand Up @@ -1587,15 +1587,28 @@ class Scanner {
_inBlockContext,
SP || TAB || LF || CR || BOM => false,
NEL => true,
_ => _isStandardCharacter(char)
_ => _isStandardCharacterAt(offset)
};
}

/// Whether the character beginning [offset] code units ahead of the scanner's
/// current position counts as a standard character.
///
/// Returns `false` when the scanner has no code unit at that position. A high
/// surrogate is accepted only when a low surrogate immediately follows it;
/// every other code unit is judged by [_isStandardCharacter].
bool _isStandardCharacterAt(int offset) {
  final lead = _scanner.peekChar(offset);
  if (lead == null) return false;

  // Anything that does not open a surrogate pair is checked directly.
  if (!isHighSurrogate(lead)) return _isStandardCharacter(lead);

  // A complete surrogate pair encodes a code point from U+010000 to U+10FFFF,
  // all of which are standard characters, so the pair only needs to be
  // well-formed: the following code unit must be a low surrogate.
  final trail = _scanner.peekChar(offset + 1);
  if (trail == null) return false;
  return isLowSurrogate(trail);
}

bool _isStandardCharacter(int char) =>
(char >= 0x00020 && char <= 0x00007E) ||
(char >= 0x000A0 && char <= 0x00D7FF) ||
(char >= 0x0E000 && char <= 0x00FFFD) ||
(char >= 0x10000 && char <= 0x10FFFF);
(char >= 0x0020 && char <= 0x007E) ||
(char >= 0x00A0 && char <= 0xD7FF) ||
(char >= 0xE000 && char <= 0xFFFD);

/// Returns the hexadecimal value of [char].
int _asHex(int char) {
Expand Down
6 changes: 6 additions & 0 deletions pkgs/yaml/lib/src/utils.dart
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,9 @@ YamlWarningCallback yamlWarningCallback = (message, [SourceSpan? span]) {
if (span != null) message = span.message(message);
print(message);
};

/// Whether [codeUnit] is a UTF-16 high (leading) surrogate.
///
/// High surrogates occupy the range `0xD800` through `0xDBFF`, i.e. every
/// value whose bits above the low ten equal `0x36`.
bool isHighSurrogate(int codeUnit) {
  return 0xD800 <= codeUnit && codeUnit <= 0xDBFF;
}

/// Whether [codeUnit] is a UTF-16 low (trailing) surrogate.
///
/// Low surrogates occupy the range `0xDC00` through `0xDFFF`, i.e. every
/// value whose bits above the low ten equal `0x37`.
bool isLowSurrogate(int codeUnit) {
  return 0xDC00 <= codeUnit && codeUnit <= 0xDFFF;
}
2 changes: 1 addition & 1 deletion pkgs/yaml/pubspec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ environment:
dependencies:
collection: ^1.15.0
source_span: ^1.8.0
string_scanner: ^1.1.0
string_scanner: ^1.2.0

dev_dependencies:
dart_flutter_team_lints: ^3.0.0
Expand Down
13 changes: 9 additions & 4 deletions pkgs/yaml/test/yaml_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -420,20 +420,25 @@ void main() {

test('[Example 2.17]', () {
expectYamlLoads({
'unicode': 'Sosa did fine.\u263A',
'unicode': 'Sosa did fine.\u263A \u{1F680}',
'control': '\b1998\t1999\t2000\n',
'hex esc': '\r\n is \r\n',
'single': '"Howdy!" he cried.',
'quoted': " # Not a 'comment'.",
'tie-fighter': '|\\-*-/|'
'tie-fighter': '|\\-*-/|',
'surrogate-pair': 'I \u{D83D}\u{DE03} ️Dart!',
'key-\u{D83D}\u{DD11}': 'Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!',
}, """
unicode: "Sosa did fine.\\u263A"
unicode: "Sosa did fine.\\u263A \\U0001F680"
control: "\\b1998\\t1999\\t2000\\n"
hex esc: "\\x0d\\x0a is \\r\\n"
single: '"Howdy!" he cried.'
quoted: ' # Not a ''comment''.'
tie-fighter: '|\\-*-/|'""");
tie-fighter: '|\\-*-/|'
surrogate-pair: I \u{D83D}\u{DE03} ️Dart!
key-\u{D83D}\u{DD11}: Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!""");
});

test('[Example 2.18]', () {
Expand Down

0 comments on commit a0f9e57

Please sign in to comment.