Merge branch 'johan/functional-ssfs'

This merge splits the styled string splitter into multiple functions for readability and maintainability. This makes BenchmarkHighlightedSearch about 10% slower, but I think it's worth the tradeoff.
walles · Nov 11, 2023 · cfdf99d · cfdf99d
2 parents c4e9437 + 3676bbf
commit cfdf99d
Show file tree

Hide file tree

Showing 4 changed files with 342 additions and 257 deletions.
diff --git a/m/ansiTokenizer.go b/m/ansiTokenizer.go
@@ -424,194 +424,10 @@ type _StyledString struct {
 	Style  twin.Style
 }
 
-type parseState int
-
-const (
-	initial parseState = iota
-	justSawEsc
-	inStyle
-	gotOsc      // OSC = Operating System Command = ESC]
-	gotOsc8     // ESC]8
-	gotOsc8Semi // ESC]8;
-	inUrl       // After ESC]8;;
-	inUrlGotEsc // Expecting a \ now to terminate the URL
-)
-
-func styledStringsFromString(s string) styledStringsWithTrailer {
-	if !strings.ContainsAny(s, "\x1b") {
-		// This shortcut makes BenchmarkPlainTextSearch() perform a lot better
-		return styledStringsWithTrailer{
-			trailer: twin.StyleDefault,
-			styledStrings: []_StyledString{{
-				String: s,
-				Style:  twin.StyleDefault,
-			}},
-		}
-	}
-
-	trailer := twin.StyleDefault
-	parts := make([]_StyledString, 1)
-
-	state := initial
-	escIndex := -1 // Byte index into s
-	partStart := 0 // Byte index into s
-	urlStart := -1 // Byte index into s
-	style := twin.StyleDefault
-	for byteIndex, char := range s {
-		if state == initial {
-			if char == '\x1b' {
-				escIndex = byteIndex
-				state = justSawEsc
-			}
-			continue
-		} else if state == justSawEsc {
-			if char == '\x1b' {
-				escIndex = byteIndex
-				state = justSawEsc
-			} else if char == '[' {
-				state = inStyle
-			} else if char == ']' {
-				state = gotOsc
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == inStyle {
-			if char == '\x1b' {
-				escIndex = byteIndex
-				state = justSawEsc
-			} else if (char >= '0' && char <= '9') || char == ';' {
-				// Stay in style
-			} else if char == 'm' {
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-
-				style = updateStyle(style, s[escIndex:byteIndex+1])
-				partStart = byteIndex + 1 // Next part starts after this 'm'
-				state = initial
-			} else if char == 'K' {
-				ansiStyle := s[escIndex : byteIndex+1]
-				if ansiStyle != "\x1b[K" && ansiStyle != "\x1b[0K" {
-					// Not a supported clear operation, just treat the whole thing as plain text
-					state = initial
-					continue
-				}
-
-				// Handle clear-to-end-of-line
-
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-
-				trailer = style
-				partStart = byteIndex + 1 // Next part starts after this 'K'
-				state = initial
-			} else {
-				// Unsupported sequence, just treat the whole thing as plain text
-				state = initial
-			}
-			continue
-		} else if state == gotOsc {
-			if char == '8' {
-				state = gotOsc8
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == gotOsc8 {
-			if char == ';' {
-				state = gotOsc8Semi
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == gotOsc8Semi {
-			if char == ';' {
-				urlStart = byteIndex + 1
-				state = inUrl
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == inUrl {
-			// Ref: https://stackoverflow.com/a/1547940/473672
-			const validChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="
-			if char == '\x1b' {
-				state = inUrlGotEsc
-			} else if char == '\x07' {
-				// End of URL
-
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-				partStart = byteIndex + 1
-
-				url := s[urlStart:byteIndex]
-				style = style.WithHyperlink(&url)
-				state = initial
-			} else if strings.ContainsRune(validChars, char) {
-				// Stay in URL
-			} else {
-				// Invalid URL character, just treat the whole thing as plain text
-				state = initial
-			}
-			continue
-		} else if state == inUrlGotEsc {
-			if char == '\\' {
-				// End of URL
-
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-				partStart = byteIndex + 1
-
-				url := s[urlStart : byteIndex-1]
-				style = style.WithHyperlink(&url)
-				state = initial
-			} else {
-				// Broken ending, just treat the whole thing as plain text
-				state = initial
-			}
-			continue
-		}
-
-		panic("We should never get here")
-	}
-
-	if partStart < len(s) {
-		// Consume the most recent part
-		parts = append(parts, _StyledString{
-			String: s[partStart:],
-			Style:  style,
-		})
-	}
-
-	return styledStringsWithTrailer{
-		styledStrings: parts,
-		trailer:       trailer,
-	}
-}
-
-// updateStyle parses a string of the form "ESC[33m" into changes to style
-func updateStyle(style twin.Style, escapeSequence string) twin.Style {
-	numbers := strings.Split(escapeSequence[2:len(escapeSequence)-1], ";")
+// rawUpdateStyle parses a string of the form "33m" into changes to style. This
+// is what comes after ESC[ in an ANSI SGR sequence.
+func rawUpdateStyle(style twin.Style, escapeSequenceWithoutHeader string) twin.Style {
+	numbers := strings.Split(escapeSequenceWithoutHeader[:len(escapeSequenceWithoutHeader)-1], ";")
 	index := 0
 	for index < len(numbers) {
 		number := numbers[index]

diff --git a/m/ansiTokenizer_test.go b/m/ansiTokenizer_test.go
@@ -28,76 +28,78 @@ func cellsToPlainString(cells []twin.Cell) string {
 // without logging any errors
 func TestTokenize(t *testing.T) {
 	for _, fileName := range getTestFiles() {
-		file, err := os.Open(fileName)
-		if err != nil {
-			t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
-			continue
-		}
-		defer func() {
-			if err := file.Close(); err != nil {
-				panic(err)
+		t.Run(fileName, func(t *testing.T) {
+			file, err := os.Open(fileName)
+			if err != nil {
+				t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
+				return
 			}
-		}()
-
-		myReader := NewReaderFromStream(fileName, file)
-		for !myReader.done.Load() {
-		}
+			defer func() {
+				if err := file.Close(); err != nil {
+					panic(err)
+				}
+			}()
 
-		for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
-			line := myReader.GetLine(lineNumber)
-			lineNumber++
+			myReader := NewReaderFromStream(fileName, file)
+			for !myReader.done.Load() {
+			}
 
-			var loglines strings.Builder
-			log.SetOutput(&loglines)
+			for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
+				line := myReader.GetLine(lineNumber)
+				lineNumber++
 
-			tokens := cellsFromString(line.raw).Cells
-			plainString := withoutFormatting(line.raw)
-			if len(tokens) != utf8.RuneCountInString(plainString) {
-				t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
-					fileName, lineNumber,
-					len(tokens), utf8.RuneCountInString(plainString), line.raw)
-				continue
-			}
+				var loglines strings.Builder
+				log.SetOutput(&loglines)
 
-			// Tokens and plain have the same lengths, compare contents
-			plainStringChars := []rune(plainString)
-			for index, plainChar := range plainStringChars {
-				cellChar := tokens[index]
-				if cellChar.Rune == plainChar {
+				tokens := cellsFromString(line.raw).Cells
+				plainString := withoutFormatting(line.raw)
+				if len(tokens) != utf8.RuneCountInString(plainString) {
+					t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
+						fileName, lineNumber,
+						len(tokens), utf8.RuneCountInString(plainString), line.raw)
 					continue
 				}
 
-				if cellChar.Rune == '•' && plainChar == 'o' {
-					// Pretty bullets on man pages
-					continue
+				// Tokens and plain have the same lengths, compare contents
+				plainStringChars := []rune(plainString)
+				for index, plainChar := range plainStringChars {
+					cellChar := tokens[index]
+					if cellChar.Rune == plainChar {
+						continue
+					}
+
+					if cellChar.Rune == '•' && plainChar == 'o' {
+						// Pretty bullets on man pages
+						continue
+					}
+
+					// Chars mismatch!
+					plainStringFromCells := cellsToPlainString(tokens)
+					positionMarker := strings.Repeat(" ", index) + "^"
+					cellCharString := string(cellChar.Rune)
+					if !twin.Printable(cellChar.Rune) {
+						cellCharString = fmt.Sprint(int(cellChar.Rune))
+					}
+					plainCharString := string(plainChar)
+					if !twin.Printable(plainChar) {
+						plainCharString = fmt.Sprint(int(plainChar))
+					}
+					t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n       %s",
+						fileName, lineNumber, index,
+						cellCharString, plainCharString,
+						plainString,
+						plainStringFromCells,
+						positionMarker,
+					)
+					break
 				}
 
-				// Chars mismatch!
-				plainStringFromCells := cellsToPlainString(tokens)
-				positionMarker := strings.Repeat(" ", index) + "^"
-				cellCharString := string(cellChar.Rune)
-				if !twin.Printable(cellChar.Rune) {
-					cellCharString = fmt.Sprint(int(cellChar.Rune))
-				}
-				plainCharString := string(plainChar)
-				if !twin.Printable(plainChar) {
-					plainCharString = fmt.Sprint(int(plainChar))
+				if len(loglines.String()) != 0 {
+					t.Errorf("%s: %s", fileName, loglines.String())
+					continue
 				}
-				t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n       %s",
-					fileName, lineNumber, index,
-					cellCharString, plainCharString,
-					plainString,
-					plainStringFromCells,
-					positionMarker,
-				)
-				break
-			}
-
-			if len(loglines.String()) != 0 {
-				t.Errorf("%s: %s", fileName, loglines.String())
-				continue
 			}
-		}
+		})
 	}
 }
 
@@ -229,8 +231,8 @@ func TestConsumeCompositeColorIncomplete24Bit(t *testing.T) {
 	assert.Assert(t, color == nil)
 }
 
-func TestUpdateStyle(t *testing.T) {
-	numberColored := updateStyle(twin.StyleDefault, "\x1b[33m")
+func TestRawUpdateStyle(t *testing.T) {
+	numberColored := rawUpdateStyle(twin.StyleDefault, "33m")
 	assert.Equal(t, numberColored, twin.StyleDefault.Foreground(twin.NewColor16(3)))
 }
 
@@ -287,15 +289,18 @@ func TestHyperlink_incomplete(t *testing.T) {
 	complete := "a\x1b]8;;X\x1b\\"
 
 	for l := len(complete) - 1; l >= 0; l-- {
-		tokens := cellsFromString(complete[:l]).Cells
-
-		for i := 0; i < l; i++ {
-			if complete[i] == '\x1b' {
-				// These get special rendering, if everything else matches
-				// that's good enough.
-				continue
+		incomplete := complete[:l]
+		t.Run(fmt.Sprintf("l=%d incomplete=<%s>", l, strings.ReplaceAll(incomplete, "\x1b", "ESC")), func(t *testing.T) {
+			tokens := cellsFromString(incomplete).Cells
+
+			for i := 0; i < l; i++ {
+				if complete[i] == '\x1b' {
+					// These get special rendering, if everything else matches
+					// that's good enough.
+					continue
+				}
+				assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
 			}
-			assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
-		}
+		})
 	}
 }