Skip to content

Commit

Permalink
Merge branch 'johan/functional-ssfs'
Browse files Browse the repository at this point in the history
This merge splits the styled string splitter into multiple functions for
readability and maintainability.

This makes BenchmarkHighlightedSearch about 10% slower, but I think
the tradeoff is worth it.
  • Loading branch information
walles committed Nov 11, 2023
2 parents c4e9437 + 3676bbf commit cfdf99d
Show file tree
Hide file tree
Showing 4 changed files with 342 additions and 257 deletions.
192 changes: 4 additions & 188 deletions m/ansiTokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,194 +424,10 @@ type _StyledString struct {
Style twin.Style
}

// parseState is the current state of the escape sequence state machine in
// styledStringsFromString.
type parseState int

// The states are listed roughly in the order they are entered while an escape
// sequence is being read. Any unexpected character sends the machine back to
// initial, so that the sequence read so far is kept as plain text.
const (
// Not inside any escape sequence, looking for an ESC
initial parseState = iota
// The previous character was ESC, a '[' or a ']' decides what comes next
justSawEsc
// After ESC[, reading digits and semicolons until a final 'm' or 'K'
inStyle
gotOsc // OSC = Operating System Command = ESC]
gotOsc8 // ESC]8
gotOsc8Semi // ESC]8;
inUrl // After ESC]8;;
inUrlGotEsc // Expecting a \ now to terminate the URL
)

// styledStringsFromString splits s into runs of text that share one style,
// interpreting the ANSI escape sequences it finds along the way.
//
// Recognized sequences:
//   - SGR, "ESC[...m": updates the style of the text that follows
//   - Clear to end of line, "ESC[K" and "ESC[0K": stores the current style
//     as the returned trailer
//   - OSC 8 hyperlinks, "ESC]8;;URL" terminated by either BEL or "ESC\":
//     sets URL as the hyperlink of the style of the text that follows
//
// Recognized sequences are left out of the returned strings. Anything else,
// including incomplete or malformed sequences, is kept as plain text.
//
// The returned trailer is twin.StyleDefault unless s contains a clear to end
// of line sequence.
func styledStringsFromString(s string) styledStringsWithTrailer {
	if !strings.ContainsRune(s, '\x1b') {
		// This shortcut makes BenchmarkPlainTextSearch() perform a lot better
		return styledStringsWithTrailer{
			trailer: twin.StyleDefault,
			styledStrings: []_StyledString{{
				String: s,
				Style:  twin.StyleDefault,
			}},
		}
	}

	// Characters allowed in a hyperlink URL.
	// Ref: https://stackoverflow.com/a/1547940/473672
	const validURLChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="

	trailer := twin.StyleDefault

	// Length zero on purpose: parts are only ever appended, so a non-zero
	// length would leave zero-valued entries at the front of the result.
	parts := make([]_StyledString, 0, 1)

	state := initial
	escIndex := -1 // Byte index into s, the ESC starting the current sequence
	partStart := 0 // Byte index into s, start of the text not yet in parts
	urlStart := -1 // Byte index into s, first byte of the hyperlink URL
	style := twin.StyleDefault

	// consumePart appends the text between partStart and the start of the
	// current escape sequence, if there is any, and then moves partStart to
	// nextStart. Call it only once a sequence has been fully recognized.
	consumePart := func(nextStart int) {
		if partStart < escIndex {
			parts = append(parts, _StyledString{
				String: s[partStart:escIndex],
				Style:  style,
			})
		}
		partStart = nextStart
	}

	for byteIndex, char := range s {
		switch state {
		case initial:
			if char == '\x1b' {
				escIndex = byteIndex
				state = justSawEsc
			}

		case justSawEsc:
			switch char {
			case '\x1b':
				// Start over from this ESC, the state stays the same
				escIndex = byteIndex
			case '[':
				state = inStyle
			case ']':
				state = gotOsc
			default:
				state = initial
			}

		case inStyle:
			switch {
			case char == '\x1b':
				escIndex = byteIndex
				state = justSawEsc
			case (char >= '0' && char <= '9') || char == ';':
				// Stay in style
			case char == 'm':
				// The text before the sequence gets the old style, so
				// consume it before updating. The next part starts after
				// this 'm'.
				consumePart(byteIndex + 1)
				style = updateStyle(style, s[escIndex:byteIndex+1])
				state = initial
			case char == 'K':
				state = initial

				sequence := s[escIndex : byteIndex+1]
				if sequence == "\x1b[K" || sequence == "\x1b[0K" {
					// Clear to end of line, the next part starts after
					// this 'K'
					consumePart(byteIndex + 1)
					trailer = style
				}
				// Other clear operations are not supported, those are
				// treated as plain text
			default:
				// Unsupported sequence, just treat the whole thing as plain text
				state = initial
			}

		case gotOsc:
			if char == '8' {
				state = gotOsc8
			} else {
				state = initial
			}

		case gotOsc8:
			if char == ';' {
				state = gotOsc8Semi
			} else {
				state = initial
			}

		case gotOsc8Semi:
			if char == ';' {
				urlStart = byteIndex + 1
				state = inUrl
			} else {
				state = initial
			}

		case inUrl:
			switch {
			case char == '\x1b':
				// escIndex is left alone here, it still points to the ESC
				// that started the hyperlink sequence
				state = inUrlGotEsc
			case char == '\x07':
				// End of URL, BEL terminated
				consumePart(byteIndex + 1)

				url := s[urlStart:byteIndex]
				style = style.WithHyperlink(&url)
				state = initial
			case strings.ContainsRune(validURLChars, char):
				// Stay in URL
			default:
				// Invalid URL character, just treat the whole thing as plain text
				state = initial
			}

		case inUrlGotEsc:
			if char == '\\' {
				// End of URL, ESC\ terminated
				consumePart(byteIndex + 1)

				// The -1 is for not including the ESC in the URL
				url := s[urlStart : byteIndex-1]
				style = style.WithHyperlink(&url)
			}
			// Anything but a backslash is a broken ending, and the whole
			// thing is treated as plain text
			state = initial

		default:
			panic("We should never get here")
		}
	}

	if partStart < len(s) {
		// Consume whatever is left, including any unfinished sequence
		parts = append(parts, _StyledString{
			String: s[partStart:],
			Style:  style,
		})
	}

	return styledStringsWithTrailer{
		styledStrings: parts,
		trailer:       trailer,
	}
}

// updateStyle parses a string of the form "ESC[33m" into changes to style
func updateStyle(style twin.Style, escapeSequence string) twin.Style {
numbers := strings.Split(escapeSequence[2:len(escapeSequence)-1], ";")
// rawUpdateStyle parses a string of the form "33m" into changes to style. This
// is what comes after ESC[ in an ANSI SGR sequence.
func rawUpdateStyle(style twin.Style, escapeSequenceWithoutHeader string) twin.Style {
numbers := strings.Split(escapeSequenceWithoutHeader[:len(escapeSequenceWithoutHeader)-1], ";")
index := 0
for index < len(numbers) {
number := numbers[index]
Expand Down
143 changes: 74 additions & 69 deletions m/ansiTokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,76 +28,78 @@ func cellsToPlainString(cells []twin.Cell) string {
// without logging any errors
func TestTokenize(t *testing.T) {
for _, fileName := range getTestFiles() {
file, err := os.Open(fileName)
if err != nil {
t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
continue
}
defer func() {
if err := file.Close(); err != nil {
panic(err)
t.Run(fileName, func(t *testing.T) {
file, err := os.Open(fileName)
if err != nil {
t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
return
}
}()

myReader := NewReaderFromStream(fileName, file)
for !myReader.done.Load() {
}
defer func() {
if err := file.Close(); err != nil {
panic(err)
}
}()

for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
line := myReader.GetLine(lineNumber)
lineNumber++
myReader := NewReaderFromStream(fileName, file)
for !myReader.done.Load() {
}

var loglines strings.Builder
log.SetOutput(&loglines)
for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
line := myReader.GetLine(lineNumber)
lineNumber++

tokens := cellsFromString(line.raw).Cells
plainString := withoutFormatting(line.raw)
if len(tokens) != utf8.RuneCountInString(plainString) {
t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
fileName, lineNumber,
len(tokens), utf8.RuneCountInString(plainString), line.raw)
continue
}
var loglines strings.Builder
log.SetOutput(&loglines)

// Tokens and plain have the same lengths, compare contents
plainStringChars := []rune(plainString)
for index, plainChar := range plainStringChars {
cellChar := tokens[index]
if cellChar.Rune == plainChar {
tokens := cellsFromString(line.raw).Cells
plainString := withoutFormatting(line.raw)
if len(tokens) != utf8.RuneCountInString(plainString) {
t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
fileName, lineNumber,
len(tokens), utf8.RuneCountInString(plainString), line.raw)
continue
}

if cellChar.Rune == '•' && plainChar == 'o' {
// Pretty bullets on man pages
continue
// Tokens and plain have the same lengths, compare contents
plainStringChars := []rune(plainString)
for index, plainChar := range plainStringChars {
cellChar := tokens[index]
if cellChar.Rune == plainChar {
continue
}

if cellChar.Rune == '•' && plainChar == 'o' {
// Pretty bullets on man pages
continue
}

// Chars mismatch!
plainStringFromCells := cellsToPlainString(tokens)
positionMarker := strings.Repeat(" ", index) + "^"
cellCharString := string(cellChar.Rune)
if !twin.Printable(cellChar.Rune) {
cellCharString = fmt.Sprint(int(cellChar.Rune))
}
plainCharString := string(plainChar)
if !twin.Printable(plainChar) {
plainCharString = fmt.Sprint(int(plainChar))
}
t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n %s",
fileName, lineNumber, index,
cellCharString, plainCharString,
plainString,
plainStringFromCells,
positionMarker,
)
break
}

// Chars mismatch!
plainStringFromCells := cellsToPlainString(tokens)
positionMarker := strings.Repeat(" ", index) + "^"
cellCharString := string(cellChar.Rune)
if !twin.Printable(cellChar.Rune) {
cellCharString = fmt.Sprint(int(cellChar.Rune))
}
plainCharString := string(plainChar)
if !twin.Printable(plainChar) {
plainCharString = fmt.Sprint(int(plainChar))
if len(loglines.String()) != 0 {
t.Errorf("%s: %s", fileName, loglines.String())
continue
}
t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n %s",
fileName, lineNumber, index,
cellCharString, plainCharString,
plainString,
plainStringFromCells,
positionMarker,
)
break
}

if len(loglines.String()) != 0 {
t.Errorf("%s: %s", fileName, loglines.String())
continue
}
}
})
}
}

Expand Down Expand Up @@ -229,8 +231,8 @@ func TestConsumeCompositeColorIncomplete24Bit(t *testing.T) {
assert.Assert(t, color == nil)
}

func TestUpdateStyle(t *testing.T) {
numberColored := updateStyle(twin.StyleDefault, "\x1b[33m")
func TestRawUpdateStyle(t *testing.T) {
numberColored := rawUpdateStyle(twin.StyleDefault, "33m")
assert.Equal(t, numberColored, twin.StyleDefault.Foreground(twin.NewColor16(3)))
}

Expand Down Expand Up @@ -287,15 +289,18 @@ func TestHyperlink_incomplete(t *testing.T) {
complete := "a\x1b]8;;X\x1b\\"

for l := len(complete) - 1; l >= 0; l-- {
tokens := cellsFromString(complete[:l]).Cells

for i := 0; i < l; i++ {
if complete[i] == '\x1b' {
// These get special rendering, if everything else matches
// that's good enough.
continue
incomplete := complete[:l]
t.Run(fmt.Sprintf("l=%d incomplete=<%s>", l, strings.ReplaceAll(incomplete, "\x1b", "ESC")), func(t *testing.T) {
tokens := cellsFromString(incomplete).Cells

for i := 0; i < l; i++ {
if complete[i] == '\x1b' {
// These get special rendering, if everything else matches
// that's good enough.
continue
}
assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
}
assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
}
})
}
}
Loading

0 comments on commit cfdf99d

Please sign in to comment.