fix: Include whitespaces in extracted tokens (#13738)
Co-authored-by: Travis Patterson <[email protected]>
na-- and MasslessParticle authored Aug 6, 2024
1 parent da63ca7 commit 7683a79
Showing 2 changed files with 54 additions and 56 deletions.
pkg/pattern/tokenization/tokenization.go (19 changes: 8 additions & 11 deletions)

@@ -1,7 +1,6 @@
 package tokenization

 import (
-	"bytes"
 	"unsafe"
 )
@@ -29,15 +28,15 @@ type tokenizer struct {
 	tokens []string
 }

-func (t *tokenizer) countOrSaveToken(endTokenPos, skip int) {
+func (t *tokenizer) countOrSaveToken(endTokenPos int) {
 	if t.tokens != nil {
 		// Intentionally written like this and not with append(), so this can
 		// panic if we ever exceed the preallocated slice size, since that means
 		// we have a nasty bug in handleNextToken() below.
 		t.tokens[t.tokenCount] = t.line[t.tpos:endTokenPos]
 	}
 	t.tokenCount++
-	t.tpos = endTokenPos + skip
+	t.tpos = endTokenPos
 }

 func (t *tokenizer) handleNextToken() bool {
@@ -55,7 +54,7 @@ func (t *tokenizer) handleNextToken() bool {
 		// outside of a quoted string.
 		case escaped:
 			if curQuotePos < 0 && delimiters[c] {
-				t.countOrSaveToken(p, 1)
+				t.countOrSaveToken(p + 1)
 				return true
 			} else {
 				escaped = false
@@ -89,7 +88,7 @@ func (t *tokenizer) handleNextToken() bool {
 		// If we encounter a delimiter outside of a quote, count or save the
 		// token and skip the delimiter.
 		case delimiters[c]:
-			t.countOrSaveToken(p, 1)
+			t.countOrSaveToken(p + 1)
 			return true

 		// Handle likely JSON object keys that have been serialized without
@@ -108,11 +107,11 @@ func (t *tokenizer) handleNextToken() bool {
 		// wasn't a delimiter right before the comma.
 		case t.maybeJSON && p > t.tpos && (c == ':' || c == ',') && p+1 < lineLen:
 			if c == ':' && t.line[p-1] == '"' && !delimiters[t.line[p+1]] {
-				t.countOrSaveToken(p+1, 0)
+				t.countOrSaveToken(p + 1)
 				return true
 			}
 			if c == ',' && t.line[p+1] == '"' {
-				t.countOrSaveToken(p, 0)
+				t.countOrSaveToken(p)
 				return true
 			}
 		}
@@ -126,12 +125,12 @@ func (t *tokenizer) handleNextToken() bool {
 	// unterminated quote and the quote itself as a single token, and continue
 	// fairly normally from there.
 	if curQuotePos > 0 {
-		t.countOrSaveToken(curQuotePos+1, 0)
+		t.countOrSaveToken(curQuotePos + 1)
 		return true
 	}

 	if t.tpos < len(t.line) {
-		t.countOrSaveToken(len(t.line), 0)
+		t.countOrSaveToken(len(t.line))
 		return true
 	}

@@ -193,8 +192,6 @@ func (t *tokenizer) tokenize() []string {
 }

 func PreprocessAndTokenize(content []byte) []string {
-	content = bytes.TrimSpace(content)
-
 	t := tokenizer{rawLine: content, maxTokens: 100} // TODO: parametrize maxTokens

 	return t.tokenize()

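Note on the net effect: with the skip parameter gone, countOrSaveToken now cuts tokens at p + 1, so each extracted token keeps its trailing delimiter (whitespace included), and removing the bytes.TrimSpace call means leading and trailing whitespace of the line also survives. Below is a minimal sketch of exercising the exported entry point; the import path and the sample log line are assumptions for illustration, not part of this commit.

package main

import (
	"fmt"

	"github.com/grafana/loki/v3/pkg/pattern/tokenization"
)

func main() {
	// Hypothetical input; note that PreprocessAndTokenize also
	// preprocesses the line (e.g. replacing things like numbers or
	// timestamps with placeholders) before tokenizing it.
	line := []byte("level=info msg=done")

	// After this commit each returned token should keep its trailing
	// delimiter, so concatenating the tokens reproduces the preprocessed
	// line instead of losing the whitespace between them.
	tokens := tokenization.PreprocessAndTokenize(line)
	fmt.Printf("%q\n", tokens)
}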