Skip to content

Commit

Permalink
Small changes for string parsing (#140)
Browse files Browse the repository at this point in the history
* Small changes for string parsing

* Update src/GraphQLParser/LexerContext.cs

Co-authored-by: Shane Krueger <[email protected]>

Co-authored-by: Shane Krueger <[email protected]>
  • Loading branch information
sungam3r and Shane32 authored Nov 6, 2021
1 parent cb70e0d commit 8c41c52
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 42 deletions.
12 changes: 6 additions & 6 deletions src/GraphQLParser.Tests/Validation/LexerValidationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -370,10 +370,10 @@ public void Lex_UnescapedControlChar_Blockstring_ThrowsExceptionWithCorrectMessa
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"contains unescaped \u0007 control char".Lex());

exception.Message.ShouldBe(
"Syntax Error GraphQL (1:23) Invalid character within BlockString: \\u0007.\n" +
"Syntax Error GraphQL (1:23) Invalid character within block string: \\u0007.\n" +
"1: \"\"\"contains unescaped \\u0007 control char\n" +
" ^\n");
exception.Description.ShouldBe("Invalid character within BlockString: \\u0007.");
exception.Description.ShouldBe("Invalid character within block string: \\u0007.");
exception.Line.ShouldBe(1);
exception.Column.ShouldBe(23);
}
Expand Down Expand Up @@ -412,10 +412,10 @@ public void Lex_UnterminatedBlockString_ThrowsExceptionWithCorrectMessage()
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"".Lex());

exception.Message.ShouldBe(
"Syntax Error GraphQL (1:4) Unterminated string.\n" +
"Syntax Error GraphQL (1:4) Unterminated block string.\n" +
"1: \"\"\"\n" +
" ^\n");
exception.Description.ShouldBe("Unterminated string.");
exception.Description.ShouldBe("Unterminated block string.");
exception.Line.ShouldBe(1);
exception.Column.ShouldBe(4);
}
Expand All @@ -426,10 +426,10 @@ public void Lex_UnterminatedBlockStringWithText_ThrowsExceptionWithCorrectMessag
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"no end triple-quote\"\"".Lex());

exception.Message.ShouldBe(
"Syntax Error GraphQL (1:25) Unterminated string.\n" +
"Syntax Error GraphQL (1:25) Unterminated block string.\n" +
"1: \"\"\"no end triple-quote\"\"\n" +
" ^\n");
exception.Description.ShouldBe("Unterminated string.");
exception.Description.ShouldBe("Unterminated block string.");
exception.Line.ShouldBe(1);
exception.Column.ShouldBe(25);
}
Expand Down
82 changes: 46 additions & 36 deletions src/GraphQLParser/LexerContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,9 @@ public Token GetToken()

if (code == '"')
{
if (_currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
{
return ReadBlockString();
}
else
{
return ReadString();
}
return _currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"'
? ReadBlockString()
: ReadString();
}

return Throw_From_GetToken2(code);
Expand Down Expand Up @@ -133,7 +128,11 @@ private Token ReadComment()
int start = _currentIndex;
char code = NextCode();

Span<char> buffer = stackalloc char[4096];
// The buffer on the stack allows to get rid of intermediate heap allocations if the string
// 1) not too long
// or
// 2) does not contain escape sequences.
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
StringBuilder? sb = null;

int index = 0;
Expand All @@ -149,8 +148,7 @@ private Token ReadComment()
}
catch (IndexOutOfRangeException) // fallback to StringBuilder in case of buffer overflow
{
if (sb == null)
sb = new StringBuilder(buffer.Length * 2);
sb ??= new StringBuilder(buffer.Length * 2);

for (int i = 0; i < buffer.Length; ++i)
sb.Append(buffer[i]);
Expand Down Expand Up @@ -181,16 +179,25 @@ private Token ReadComment()
);
}

// TODO: this method can still be optimized no not allocate at all if block string:
//
// 1) not too long
// 2) has no escape sequences
// 3) has no '\r' characters
// 4) has no initial whitespace on each line, ignoring the first line (or, has no '\n' characters)
//
// In this case, ROM for the returned token represents unmodified part of the source ROM,
// so it can be just sliced from '_source' as you can see in more simple ReadString method.
private Token ReadBlockString()
{
int start = _currentIndex += 2;
int start = _currentIndex += 2; // skip ""
char code = NextCode();

Span<char> buffer = stackalloc char[4096];
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
StringBuilder? sb = null;

int index = 0;
bool escape = false; //when the last character was \
bool escape = false; // when the last character was \
bool lastWasCr = false;

while (_currentIndex < _source.Length)
Expand All @@ -200,30 +207,30 @@ private Token ReadBlockString()
Throw_From_ReadBlockString1(code);
}

//check for """
// check for """
if (code == '"' && _currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
{
//if last character was \ then go ahead and write out the """, skipping the \
// if last character was \ then go ahead and write out the """, skipping the \
if (escape)
{
escape = false;
}
else
{
//end of blockstring
// end of block string
break;
}
}
else if (escape)
{
//last character was \ so write the \ and then retry this character with escaped = false
// last character was \ so write the \ and then retry this character with escaped = false
code = '\\';
_currentIndex--;
escape = false;
}
else if (code == '\\')
{
//this character is a \ so don't write anything yet, but check the next character
// this character is a \ so don't write anything yet, but check the next character
escape = true;
code = NextCode();
lastWasCr = false;
Expand All @@ -237,15 +244,14 @@ private Token ReadBlockString()

if (!(lastWasCr && code == '\n'))
{
//write code
// write code
if (index < buffer.Length)
{
buffer[index++] = code == '\r' ? '\n' : code;
}
else // fallback to StringBuilder in case of buffer overflow
{
if (sb == null)
sb = new StringBuilder(buffer.Length * 2);
sb ??= new StringBuilder(buffer.Length * 2);

for (int i = 0; i < buffer.Length; ++i)
sb.Append(buffer[i]);
Expand All @@ -262,18 +268,18 @@ private Token ReadBlockString()

if (_currentIndex >= _source.Length)
{
Throw_From_ReadString2();
Throw_From_ReadBlockString2();
}
_currentIndex += 2;
_currentIndex += 2; // skip ""

if (sb != null)
{
for (int i = 0; i < index; ++i)
sb.Append(buffer[i]);
}

//at this point, if sb != null, then sb has the whole string, otherwise buffer (of length index) has the whole string
//also, all line termination combinations have been replaced with LF
// at this point, if sb != null, then sb has the whole string, otherwise buffer (of length index) has the whole string
// also, all line termination combinations have been replaced with LF

ROM value;
if (sb != null)
Expand All @@ -297,11 +303,11 @@ private Token ReadBlockString()

static ROM ProcessBuffer(Span<char> buffer)
{
//scan string to determine maximum valid commonIndent value,
//number of initial blank lines, and number of trailing blank lines
// scan string to determine maximum valid commonIndent value,
// number of initial blank lines, and number of trailing blank lines
int commonIndent = int.MaxValue;
int initialBlankLines = 1;
int skipLinesAfter; //skip all text after line ###, as determined by the number of trailing blank lines
int skipLinesAfter; // skip all text after line ###, as determined by the number of trailing blank lines
{
int trailingBlankLines = 0;
int line = 0;
Expand Down Expand Up @@ -347,8 +353,8 @@ static ROM ProcessBuffer(Span<char> buffer)
skipLinesAfter = lines - trailingBlankLines;
}

//step through the input, skipping the initial blank lines and the trailing blank lines,
//and skipping the initial blank characters from the start of each line
// step through the input, skipping the initial blank lines and the trailing blank lines,
// and skipping the initial blank characters from the start of each line
Span<char> output = buffer.Length <= 4096 ? stackalloc char[buffer.Length] : new char[buffer.Length];
int outputIndex = 0;
{
Expand All @@ -373,7 +379,7 @@ static ROM ProcessBuffer(Span<char> buffer)
}
}

//return the string value from the output buffer
// return the string value from the output buffer
return output.Slice(0, outputIndex).ToString();
}
}
Expand All @@ -383,7 +389,7 @@ private Token ReadString()
int start = _currentIndex;
char code = NextCode();

Span<char> buffer = stackalloc char[4096];
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
StringBuilder? sb = null;

int index = 0;
Expand All @@ -404,8 +410,7 @@ private Token ReadString()
}
catch (IndexOutOfRangeException) // fallback to StringBuilder in case of buffer overflow
{
if (sb == null)
sb = new StringBuilder(buffer.Length * 2);
sb ??= new StringBuilder(buffer.Length * 2);

for (int i = 0; i < buffer.Length; ++i)
sb.Append(buffer[i]);
Expand Down Expand Up @@ -453,7 +458,12 @@ private void Throw_From_ReadString2()

private void Throw_From_ReadBlockString1(char code)
{
throw new GraphQLSyntaxErrorException($"Invalid character within BlockString: \\u{(int)code:D4}.", _source, _currentIndex);
throw new GraphQLSyntaxErrorException($"Invalid character within block string: \\u{(int)code:D4}.", _source, _currentIndex);
}

private void Throw_From_ReadBlockString2()
{
throw new GraphQLSyntaxErrorException("Unterminated block string.", _source, _currentIndex);
}

// sets escaped only to true
Expand Down

0 comments on commit 8c41c52

Please sign in to comment.