Skip to content

Commit

Permalink
kepub: Process replacements while rendering the content
Browse files Browse the repository at this point in the history
By using golang.org/x/text/transform instead of rendering to a temporary
buffer and replacing in that, we can handle the find/replace option more
efficiently.
  • Loading branch information
pgaskin committed Jul 2, 2021
1 parent 7848002 commit f35a7bf
Show file tree
Hide file tree
Showing 5 changed files with 476 additions and 41 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ require (
github.com/pgaskin/kepubify/_/go116-zip.go117 v0.0.0-20210611152744-2d89b3182523
github.com/pgaskin/kepubify/_/html v0.0.0-20210611145339-337924fbbaf0
golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520
golang.org/x/text v0.3.6
)

require github.com/hexops/gotextdiff v1.0.3
1 change: 1 addition & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520 h1:Bx6FllMpG4NWDOfhMBz1VR2QYNp/SAOHPIAsaVmxfPo=
golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
7 changes: 2 additions & 5 deletions kepub/kepub.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ type Converter struct {
// smart punctuation
smartypants bool

// find/replace in raw html output (note: inefficient, but more efficient
// than working with strings)
// find/replace in raw html output
find [][]byte
replace [][]byte
}
Expand Down Expand Up @@ -40,9 +39,7 @@ func ConverterOptionSmartypants() ConverterOption {
}
}

// ConverterOptionFindReplace replaces a raw string in the transformed HTML
// (note that this impacts performance since it requires an additional temporary
// buffer to be created for each document).
// ConverterOptionFindReplace replaces a raw string in the transformed HTML.
func ConverterOptionFindReplace(find, replace string) ConverterOption {
return func(c *Converter) {
c.find = append(c.find, []byte(find))
Expand Down
285 changes: 255 additions & 30 deletions kepub/transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ import (
"fmt"
"io"
"path"
"regexp"
"strconv"
"strings"
"unicode"
"unicode/utf8"

"github.com/beevik/etree"
"github.com/kr/smartypants"
"golang.org/x/text/transform"

"github.com/pgaskin/kepubify/_/html/golang.org/x/net/html"
"github.com/pgaskin/kepubify/_/html/golang.org/x/net/html/atom"
Expand Down Expand Up @@ -173,18 +174,9 @@ func (c *Converter) TransformContent(w io.Writer, r io.Reader) error {
transformContentClean(doc)

if len(c.find) != 0 {
buf := bytes.NewBuffer(nil)
if err := html.RenderWithOptions(buf, doc,
html.RenderOptionAllowXMLDeclarations(true),
html.RenderOptionPolyglot(true)); err != nil {
return err
}
b := buf.Bytes()
for i := range c.find {
b = bytes.ReplaceAll(b, c.find[i], c.replace[i])
}
_, err := w.Write(b)
return err
wc := transformContentReplacements(w, c.find, c.replace)
w = wc
defer wc.Close()
}

err = html.RenderWithOptions(w, doc,
Expand Down Expand Up @@ -241,8 +233,6 @@ func transformContentKoboDivs(doc *html.Node) {
}
}

var sentencere = regexp.MustCompile(`((?ms).*?[\.\!\?]['"”’“…]?\s+)`)

func transformContentKoboSpans(doc *html.Node) {
// behavior matches Kobo (checked with 3 books) as of 2020-01-12
if findClass(findAtom(doc, atom.Body), "koboSpan") != nil {
Expand All @@ -256,25 +246,13 @@ func transformContentKoboSpans(doc *html.Node) {
var cur *html.Node
stack = append(stack, findAtom(doc, atom.Body))

sentences := make([]string, 0, 8)

for len(stack) != 0 {
stack, cur = stack[:len(stack)-1], stack[len(stack)-1]
switch cur.Type {
case html.TextNode:
// split after each sentence (matches Kobo's behavior, and can't leave anything behind)
var sentences []string
if matches := sentencere.FindAllStringIndex(cur.Data, -1); len(matches) == 0 {
sentences = []string{cur.Data} // nothing matched, use the whole string
} else {
var pos int
sentences = make([]string, len(matches))
for i, match := range matches {
sentences[i] = cur.Data[pos:match[1]] // end of last match to end of the current one
pos = match[1]
}
if len(cur.Data) > pos {
sentences = append(sentences, cur.Data[pos:]) // rest of the string, if any
}
}
sentences = splitSentences(cur.Data, sentences[:0])

// wrap each sentence in a span (don't wrap whitespace unless it is
// directly under a P tag [TODO: are there any other cases we wrap
Expand Down Expand Up @@ -343,6 +321,158 @@ func transformContentKoboSpans(doc *html.Node) {
}
}

// splitSentences splits str into the sentence fragments that koboSpans are
// wrapped around. Every byte of str ends up in exactly one fragment: a
// fragment runs through its terminating punctuation ([.!?], possibly
// repeated, plus at most one trailing quote/ellipsis rune) and the
// whitespace which follows it; whatever remains at the end of the string
// becomes the final fragment (and an empty string yields one empty
// fragment).
//
// To make this zero-allocation, pass a zero-length slice for splitSentences
// to take ownership of. To re-use the slice, pass the returned slice,
// sliced to zero. If the slice is too small, it will be grown, causing an
// allocation.
//
// This hand-rolled state machine is around three times as fast as the
// regexp-based implementation on average (see splitSentencesRegexp in the
// tests), and even faster when the sentences slice is re-used. The output
// is identical.
func splitSentences(str string, sentences []string) []string {
	// rune classes
	const (
		classPunct = iota // sentence-terminating punctuation: . ! ?
		classExtra        // optional quote/ellipsis runes (at most one consumed after the terminator)
		classSpace        // ASCII whitespace only (what regexp \s matches)
		classOther        // any other valid rune
		classBad          // an invalid UTF-8 byte
		classEnd          // end of the string
	)

	// machine states
	const (
		stSentence      = iota // somewhere inside a sentence
		stPunct                // just saw terminating punctuation
		stPunctExtra           // just saw the optional quote/ellipsis after the punctuation
		stTrailingSpace        // in the whitespace run after the terminator
		stDone          = -1   // finished
	)

	if sentences == nil {
		sentences = make([]string, 0, 4) // pre-allocate some room
	}

	for i, st := 0, stSentence; st != stDone; {
		r, n := utf8.DecodeRuneInString(str[i:])

		// classify the current rune
		var cl int
		if r == utf8.RuneError {
			if n == 0 {
				cl = classEnd
			} else {
				// NOTE: a literal U+FFFD decodes to RuneError with n==3 and
				// is also classified here, same as the original behavior
				cl = classBad
			}
		} else {
			switch r {
			case '.', '!', '?':
				cl = classPunct
			case '\'', '"', '”', '’', '“', '…':
				cl = classExtra
			case '\t', '\n', '\f', '\r', ' ':
				cl = classSpace
			default:
				cl = classOther
			}
		}

		// the end of the string flushes whatever is left, and guarantees at
		// least one fragment is returned even for an empty string
		if cl == classEnd {
			if len(str) != 0 || len(sentences) == 0 {
				sentences = append(sentences, str)
			}
			st = stDone
			continue
		}

		// a completed sentence is emitted when its run of trailing
		// whitespace ends; the fragment is everything before the current
		// rune, and scanning restarts at the current rune
		if st == stTrailingSpace && cl != classSpace {
			sentences = append(sentences, str[:i])
			str, i = str[i:], 0
		}

		// state transitions (equivalent to the regexp `.*?[.!?]['"”’“…]?\s+`)
		switch {
		case cl == classPunct:
			st = stPunct
		case cl == classExtra && st == stPunct:
			st = stPunctExtra
		case cl == classSpace && st != stSentence:
			st = stTrailingSpace
		default:
			st = stSentence
		}

		i += n
	}

	return sentences
}

func koboSpan(para, seg int) *html.Node {
return &html.Node{
Type: html.ElementNode,
Expand Down Expand Up @@ -435,6 +565,23 @@ func transformContentClean(doc *html.Node) {
}
}

// transformContentReplacements wraps w so that anything written to the
// returned WriteCloser has each occurrence of find[i] replaced with
// replace[i] (applied in order, as a chain) before reaching w. Empty find
// patterns are skipped. Close must be called to flush buffered partial
// matches. Mismatched slice lengths are a programming error and panic.
func transformContentReplacements(w io.Writer, find, replace [][]byte) io.WriteCloser {
	if len(find) != len(replace) {
		panic("find and replace must be the same length")
	}
	var replacers []transform.Transformer
	for i, f := range find {
		if len(f) == 0 {
			continue
		}
		replacers = append(replacers, &byteReplacer{Find: f, Replace: replace[i]})
	}
	return transform.NewWriter(w, transform.Chain(replacers...))
}

// withText adds text to a node and returns it.
func withText(node *html.Node, text string) *html.Node {
if node.Type != html.ElementNode {
Expand Down Expand Up @@ -534,3 +681,81 @@ func includes(s, token string) bool {
}
return false
}

// byteReplacer is a streaming transform.Transformer which replaces every
// occurrence of Find with Replace, scanning left-to-right with
// non-overlapping matches (the streaming equivalent of bytes.ReplaceAll,
// which the pre-streaming implementation used). Find must be non-empty;
// Replace may be any length, including zero. The zero Reset comes from
// transform.NopResetter since the replacer keeps no state between calls.
type byteReplacer struct {
	transform.NopResetter
	Find, Replace []byte
}

// Transform implements transform.Transformer, copying src to dst while
// replacing every occurrence of b.Find with b.Replace.
//
// Per the transform.Transformer contract: when dst runs out of room, it
// returns transform.ErrShortDst after recording what was consumed/produced
// so far, and the framework re-invokes it with more destination space. When
// not at EOF, it holds back the last len(b.Find)-1 bytes of src (which
// could be the start of a match completed by the next chunk) and returns
// transform.ErrShortSrc to request more input.
func (b *byteReplacer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if len(b.Find) == 0 {
		// an empty pattern would match everywhere; the constructor
		// (transformContentReplacements) filters these out, so this is a
		// programmer error
		panic("find length must not be zero")
	}

	for {
		// find the next full match in the unprocessed input
		i := bytes.Index(src[nSrc:], b.Find)
		if i == -1 {
			break
		}

		// copy the non-matching prefix
		if n := copy(dst[nDst:], src[nSrc:nSrc+i]); n == len(src[nSrc:nSrc+i]) {
			nSrc += n
			nDst += n
		} else {
			// skip what we've already processed
			nSrc += n
			nDst += n
			// have it call us again with a larger destination buffer
			err = transform.ErrShortDst
			return
		}

		// copy the replacement in place of the match
		if n := copy(dst[nDst:], b.Replace); n == len(b.Replace) {
			nSrc += len(b.Find)
			nDst += n
		} else {
			// have it call us again with a larger destination buffer (nDst
			// was not advanced, so the partial copy is simply overwritten
			// on the retry)
			err = transform.ErrShortDst
			return
		}
	}

	if !atEOF {
		// copy everything, minus the last len(b.Find)-1 bytes, in case
		// there is another partial match at the end
		if skip := len(src[nSrc:]) - (len(b.Find) - 1); skip > 0 {
			if n := copy(dst[nDst:], src[nSrc:nSrc+skip]); n == len(src[nSrc:nSrc+skip]) {
				nSrc += n
				nDst += n
			} else {
				// skip what we've already processed
				nSrc += n
				nDst += n
				// have it call us again with a larger destination buffer
				err = transform.ErrShortDst
				return
			}
		}

		// have it call us again with more source bytes to find another match in
		err = transform.ErrShortSrc
		return
	}

	// at EOF, and no more replacements, so copy the remaining bytes
	if n := copy(dst[nDst:], src[nSrc:]); n == len(src[nSrc:]) {
		nDst += n
		nSrc += n
	} else {
		// skip what we've already copied
		nDst += n
		nSrc += n
		// have it call us again with a larger destination buffer
		err = transform.ErrShortDst
	}
	return
}
Loading

0 comments on commit f35a7bf

Please sign in to comment.