diff --git a/m/ansiTokenizer.go b/m/ansiTokenizer.go index 6e02b1d6..2da9cfe4 100644 --- a/m/ansiTokenizer.go +++ b/m/ansiTokenizer.go @@ -424,194 +424,10 @@ type _StyledString struct { Style twin.Style } -type parseState int - -const ( - initial parseState = iota - justSawEsc - inStyle - gotOsc // OSC = Operating System Command = ESC] - gotOsc8 // ESC]8 - gotOsc8Semi // ESC]8; - inUrl // After ESC]8;; - inUrlGotEsc // Expecting a \ now to terminate the URL -) - -func styledStringsFromString(s string) styledStringsWithTrailer { - if !strings.ContainsAny(s, "\x1b") { - // This shortcut makes BenchmarkPlainTextSearch() perform a lot better - return styledStringsWithTrailer{ - trailer: twin.StyleDefault, - styledStrings: []_StyledString{{ - String: s, - Style: twin.StyleDefault, - }}, - } - } - - trailer := twin.StyleDefault - parts := make([]_StyledString, 1) - - state := initial - escIndex := -1 // Byte index into s - partStart := 0 // Byte index into s - urlStart := -1 // Byte index into s - style := twin.StyleDefault - for byteIndex, char := range s { - if state == initial { - if char == '\x1b' { - escIndex = byteIndex - state = justSawEsc - } - continue - } else if state == justSawEsc { - if char == '\x1b' { - escIndex = byteIndex - state = justSawEsc - } else if char == '[' { - state = inStyle - } else if char == ']' { - state = gotOsc - } else { - state = initial - } - continue - } else if state == inStyle { - if char == '\x1b' { - escIndex = byteIndex - state = justSawEsc - } else if (char >= '0' && char <= '9') || char == ';' { - // Stay in style - } else if char == 'm' { - if partStart < escIndex { - // Consume the most recent part - parts = append(parts, _StyledString{ - String: s[partStart:escIndex], - Style: style, - }) - } - - style = updateStyle(style, s[escIndex:byteIndex+1]) - partStart = byteIndex + 1 // Next part starts after this 'm' - state = initial - } else if char == 'K' { - ansiStyle := s[escIndex : byteIndex+1] - if ansiStyle != "\x1b[K" && ansiStyle != "\x1b[0K" { - // Not a supported clear operation, just treat the whole thing as plain text - state = initial - continue - } - - // Handle clear-to-end-of-line - - if partStart < escIndex { - // Consume the most recent part - parts = append(parts, _StyledString{ - String: s[partStart:escIndex], - Style: style, - }) - } - - trailer = style - partStart = byteIndex + 1 // Next part starts after this 'K' - state = initial - } else { - // Unsupported sequence, just treat the whole thing as plain text - state = initial - } - continue - } else if state == gotOsc { - if char == '8' { - state = gotOsc8 - } else { - state = initial - } - continue - } else if state == gotOsc8 { - if char == ';' { - state = gotOsc8Semi - } else { - state = initial - } - continue - } else if state == gotOsc8Semi { - if char == ';' { - urlStart = byteIndex + 1 - state = inUrl - } else { - state = initial - } - continue - } else if state == inUrl { - // Ref: https://stackoverflow.com/a/1547940/473672 - const validChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=" - if char == '\x1b' { - state = inUrlGotEsc - } else if char == '\x07' { - // End of URL - - if partStart < escIndex { - // Consume the most recent part - parts = append(parts, _StyledString{ - String: s[partStart:escIndex], - Style: style, - }) - } - partStart = byteIndex + 1 - - url := s[urlStart:byteIndex] - style = style.WithHyperlink(&url) - state = initial - } else if strings.ContainsRune(validChars, char) { - // Stay in URL - } else { - // Invalid URL character, just treat the whole thing as plain text - state = initial - } - continue - } else if state == inUrlGotEsc { - if char == '\\' { - // End of URL - - if partStart < escIndex { - // Consume the most recent part - parts = append(parts, _StyledString{ - String: s[partStart:escIndex], - Style: style, - }) - } - partStart = byteIndex + 1 - - url := s[urlStart : byteIndex-1] - style = style.WithHyperlink(&url) - state = initial - } else { - // Broken ending, just treat the whole thing as plain text - state = initial - } - continue - } - - panic("We should never get here") - } - - if partStart < len(s) { - // Consume the most recent part - parts = append(parts, _StyledString{ - String: s[partStart:], - Style: style, - }) - } - - return styledStringsWithTrailer{ - styledStrings: parts, - trailer: trailer, - } -} - -// updateStyle parses a string of the form "ESC[33m" into changes to style -func updateStyle(style twin.Style, escapeSequence string) twin.Style { - numbers := strings.Split(escapeSequence[2:len(escapeSequence)-1], ";") +// rawUpdateStyle parses a string of the form "33m" into changes to style. This +// is what comes after ESC[ in an ANSI SGR sequence. +func rawUpdateStyle(style twin.Style, escapeSequenceWithoutHeader string) twin.Style { + numbers := strings.Split(escapeSequenceWithoutHeader[:len(escapeSequenceWithoutHeader)-1], ";") index := 0 for index < len(numbers) { number := numbers[index] diff --git a/m/ansiTokenizer_test.go b/m/ansiTokenizer_test.go index a54fe987..786b2579 100644 --- a/m/ansiTokenizer_test.go +++ b/m/ansiTokenizer_test.go @@ -28,76 +28,78 @@ func cellsToPlainString(cells []twin.Cell) string { // without logging any errors func TestTokenize(t *testing.T) { for _, fileName := range getTestFiles() { - file, err := os.Open(fileName) - if err != nil { - t.Errorf("Error opening file <%s>: %s", fileName, err.Error()) - continue - } - defer func() { - if err := file.Close(); err != nil { - panic(err) + t.Run(fileName, func(t *testing.T) { + file, err := os.Open(fileName) + if err != nil { + t.Errorf("Error opening file <%s>: %s", fileName, err.Error()) + return } - }() - - myReader := NewReaderFromStream(fileName, file) - for !myReader.done.Load() { - } + defer func() { + if err := file.Close(); err != nil { + panic(err) + } + }() - for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ { - line := myReader.GetLine(lineNumber) - lineNumber++ + myReader := NewReaderFromStream(fileName, file) + for !myReader.done.Load() { + } - var loglines strings.Builder - log.SetOutput(&loglines) + for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ { + line := myReader.GetLine(lineNumber) + lineNumber++ - tokens := cellsFromString(line.raw).Cells - plainString := withoutFormatting(line.raw) - if len(tokens) != utf8.RuneCountInString(plainString) { - t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>", - fileName, lineNumber, - len(tokens), utf8.RuneCountInString(plainString), line.raw) - continue - } + var loglines strings.Builder + log.SetOutput(&loglines) - // Tokens and plain have the same lengths, compare contents - plainStringChars := []rune(plainString) - for index, plainChar := range plainStringChars { - cellChar := tokens[index] - if cellChar.Rune == plainChar { + tokens := cellsFromString(line.raw).Cells + plainString := withoutFormatting(line.raw) + if len(tokens) != utf8.RuneCountInString(plainString) { + t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>", + fileName, lineNumber, + len(tokens), utf8.RuneCountInString(plainString), line.raw) continue } - if cellChar.Rune == '•' && plainChar == 'o' { - // Pretty bullets on man pages - continue + // Tokens and plain have the same lengths, compare contents + plainStringChars := []rune(plainString) + for index, plainChar := range plainStringChars { + cellChar := tokens[index] + if cellChar.Rune == plainChar { + continue + } + + if cellChar.Rune == '•' && plainChar == 'o' { + // Pretty bullets on man pages + continue + } + + // Chars mismatch! + plainStringFromCells := cellsToPlainString(tokens) + positionMarker := strings.Repeat(" ", index) + "^" + cellCharString := string(cellChar.Rune) + if !twin.Printable(cellChar.Rune) { + cellCharString = fmt.Sprint(int(cellChar.Rune)) + } + plainCharString := string(plainChar) + if !twin.Printable(plainChar) { + plainCharString = fmt.Sprint(int(plainChar)) + } + t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n %s", + fileName, lineNumber, index, + cellCharString, plainCharString, + plainString, + plainStringFromCells, + positionMarker, + ) + break } - // Chars mismatch! - plainStringFromCells := cellsToPlainString(tokens) - positionMarker := strings.Repeat(" ", index) + "^" - cellCharString := string(cellChar.Rune) - if !twin.Printable(cellChar.Rune) { - cellCharString = fmt.Sprint(int(cellChar.Rune)) - } - plainCharString := string(plainChar) - if !twin.Printable(plainChar) { - plainCharString = fmt.Sprint(int(plainChar)) + if len(loglines.String()) != 0 { + t.Errorf("%s: %s", fileName, loglines.String()) + continue } - t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n %s", - fileName, lineNumber, index, - cellCharString, plainCharString, - plainString, - plainStringFromCells, - positionMarker, - ) - break - } - - if len(loglines.String()) != 0 { - t.Errorf("%s: %s", fileName, loglines.String()) - continue } - } + }) } } @@ -229,8 +231,8 @@ func TestConsumeCompositeColorIncomplete24Bit(t *testing.T) { assert.Assert(t, color == nil) } -func TestUpdateStyle(t *testing.T) { - numberColored := updateStyle(twin.StyleDefault, "\x1b[33m") +func TestRawUpdateStyle(t *testing.T) { + numberColored := rawUpdateStyle(twin.StyleDefault, "33m") assert.Equal(t, numberColored, twin.StyleDefault.Foreground(twin.NewColor16(3))) } @@ -287,15 +289,18 @@ func TestHyperlink_incomplete(t *testing.T) { complete := "a\x1b]8;;X\x1b\\" for l := len(complete) - 1; l >= 0; l-- { - tokens := cellsFromString(complete[:l]).Cells - - for i := 0; i < l; i++ { - if complete[i] == '\x1b' { - // These get special rendering, if everything else matches - // that's good enough. - continue + incomplete := complete[:l] + t.Run(fmt.Sprintf("l=%d incomplete=<%s>", l, strings.ReplaceAll(incomplete, "\x1b", "ESC")), func(t *testing.T) { + tokens := cellsFromString(incomplete).Cells + + for i := 0; i < l; i++ { + if complete[i] == '\x1b' { + // These get special rendering, if everything else matches + // that's good enough. + continue + } + assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault}) } - assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault}) - } + }) } } diff --git a/m/styledStringSplitter.go b/m/styledStringSplitter.go new file mode 100644 index 00000000..23dca4e8 --- /dev/null +++ b/m/styledStringSplitter.go @@ -0,0 +1,237 @@ +package m + +import ( + "strings" + "unicode/utf8" + + "github.com/walles/moar/twin" +) + +const esc = '\x1b' + +type styledStringSplitter struct { + input string + nextByteIndex int + previousByteIndex int + + inProgressString strings.Builder + inProgressStyle twin.Style + + parts []_StyledString + trailer twin.Style +} + +func styledStringsFromString(s string) styledStringsWithTrailer { + if !strings.ContainsAny(s, "\x1b") { + // This shortcut makes BenchmarkPlainTextSearch() perform a lot better + return styledStringsWithTrailer{ + trailer: twin.StyleDefault, + styledStrings: []_StyledString{{ + String: s, + Style: twin.StyleDefault, + }}, + } + } + + splitter := styledStringSplitter{ + input: s, + } + splitter.run() + + return styledStringsWithTrailer{ + trailer: splitter.trailer, + styledStrings: splitter.parts, + } +} + +func (s *styledStringSplitter) nextChar() rune { + if s.nextByteIndex >= len(s.input) { + s.previousByteIndex = s.nextByteIndex + return -1 + } + + char, size := utf8.DecodeRuneInString(s.input[s.nextByteIndex:]) + s.previousByteIndex = s.nextByteIndex + s.nextByteIndex += size + return char +} + +// Returns whatever the last call to nextChar() returned +func (s *styledStringSplitter) lastChar() rune { + if s.previousByteIndex >= len(s.input) { + return -1 + } + + char, _ := utf8.DecodeRuneInString(s.input[s.previousByteIndex:]) + return char +} + +func (s *styledStringSplitter) run() { + char := s.nextChar() + for { + if char == -1 { + s.finalizeCurrentPart() + return + } + + if char == esc { + escIndex := s.previousByteIndex + success := s.handleEscape() + if !success { + // Somewhere in handleEscape(), we got a character that was + // unexpected. We need to treat everything up to before that + // character as just plain runes. + for _, char := range s.input[escIndex:s.previousByteIndex] { + s.handleRune(char) + } + + // Start over with the character that caused the problem + char = s.lastChar() + continue + } + } else { + s.handleRune(char) + } + + char = s.nextChar() + } +} + +func (s *styledStringSplitter) handleRune(char rune) { + s.inProgressString.WriteRune(char) +} + +func (s *styledStringSplitter) handleEscape() bool { + char := s.nextChar() + if char == '[' || char == ']' { + // Got the start of a CSI or an OSC sequence + return s.consumeControlSequence(char) + } + + return false +} + +func (s *styledStringSplitter) consumeControlSequence(charAfterEsc rune) bool { + // Points to right after "ESC[" + startIndex := s.nextByteIndex + + // We're looking for a letter to end the CSI sequence + for { + char := s.nextChar() + if char == -1 { + return false + } + + if char == ';' || (char >= '0' && char <= '9') { + // Sequence still in progress + + if s.input[startIndex:s.nextByteIndex] == "8;;" { + // Special case, here comes the URL + return s.handleUrl() + } + + continue + } + + // The end, handle what we got + endIndexExclusive := s.nextByteIndex + return s.handleCompleteControlSequence(charAfterEsc, s.input[startIndex:endIndexExclusive]) + } +} + +// If the whole CSI sequence is ESC[33m, you should call this function with just +// "33m". +func (s *styledStringSplitter) handleCompleteControlSequence(charAfterEsc rune, sequence string) bool { + if charAfterEsc != '[' { + return false + } + + if sequence == "K" || sequence == "0K" { + // Clear to end of line + s.trailer = s.inProgressStyle + return true + } + + lastChar := sequence[len(sequence)-1] + if lastChar == 'm' { + newStyle := rawUpdateStyle(s.inProgressStyle, sequence) + s.startNewPart(newStyle) + return true + } + + return false +} + +// We just got ESC]8; and should now read the URL. URLs end with ASCII 7 BEL or ESC \. +func (s *styledStringSplitter) handleUrl() bool { + // Valid URL characters. + // Ref: https://stackoverflow.com/a/1547940/473672 + const validChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=" + + // Points to right after "ESC]8;" + urlStartIndex := s.nextByteIndex + + justSawEsc := false + for { + char := s.nextChar() + if char == -1 { + return false + } + + if justSawEsc { + if char != '\\' { + return false + } + + // End of URL + urlEndIndexExclusive := s.nextByteIndex - 2 + url := s.input[urlStartIndex:urlEndIndexExclusive] + s.startNewPart(s.inProgressStyle.WithHyperlink(&url)) + return true + } + + // Invariant: justSawEsc == false + + if char == esc { + justSawEsc = true + continue + } + + if char == '\x07' { + // End of URL + urlEndIndexExclusive := s.nextByteIndex - 1 + url := s.input[urlStartIndex:urlEndIndexExclusive] + s.startNewPart(s.inProgressStyle.WithHyperlink(&url)) + return true + } + + if !strings.ContainsRune(validChars, char) { + return false + } + + // It's a valid URL char, keep going + } +} + +func (s *styledStringSplitter) startNewPart(style twin.Style) { + if style == s.inProgressStyle { + // No need to start a new part + return + } + + s.finalizeCurrentPart() + s.inProgressString.Reset() + s.inProgressStyle = style +} + +func (s *styledStringSplitter) finalizeCurrentPart() { + if s.inProgressString.Len() == 0 { + // Nothing to do + return + } + + s.parts = append(s.parts, _StyledString{ + String: s.inProgressString.String(), + Style: s.inProgressStyle, + }) +} diff --git a/m/styledStringSplitter_test.go b/m/styledStringSplitter_test.go new file mode 100644 index 00000000..e0e26c2e --- /dev/null +++ b/m/styledStringSplitter_test.go @@ -0,0 +1,27 @@ +package m + +import ( + "testing" + + "gotest.tools/v3/assert" +) + +func TestNextCharLastChar_base(t *testing.T) { + s := styledStringSplitter{ + input: "a", + } + + assert.Equal(t, 'a', s.nextChar()) + assert.Equal(t, 'a', s.lastChar()) + assert.Equal(t, rune(-1), s.nextChar()) + assert.Equal(t, rune(-1), s.lastChar()) +} + +func TestNextCharLastChar_empty(t *testing.T) { + s := styledStringSplitter{ + input: "", + } + + assert.Equal(t, rune(-1), s.nextChar()) + assert.Equal(t, rune(-1), s.lastChar()) +}