From f35a7bf574f670e9ef405a255dc4df13c23a8a7a Mon Sep 17 00:00:00 2001 From: Patrick Gaskin Date: Sat, 12 Jun 2021 17:55:42 -0400 Subject: [PATCH] kepub: Process replacements while rendering the content By using golang.org/x/text/transform instead of rendering to a temporary buffer and replacing in that, we can handle the find/replace option more efficiently. --- go.mod | 1 + go.sum | 1 + kepub/kepub.go | 7 +- kepub/transform.go | 285 +++++++++++++++++++++++++++++++++++----- kepub/transform_test.go | 223 ++++++++++++++++++++++++++++++- 5 files changed, 476 insertions(+), 41 deletions(-) diff --git a/go.mod b/go.mod index ecc5548..b8ea00e 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( github.com/pgaskin/kepubify/_/go116-zip.go117 v0.0.0-20210611152744-2d89b3182523 github.com/pgaskin/kepubify/_/html v0.0.0-20210611145339-337924fbbaf0 golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520 + golang.org/x/text v0.3.6 ) require github.com/hexops/gotextdiff v1.0.3 diff --git a/go.sum b/go.sum index f0898fe..e58ddb3 100644 --- a/go.sum +++ b/go.sum @@ -18,5 +18,6 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520 h1:Bx6FllMpG4NWDOfhMBz1VR2QYNp/SAOHPIAsaVmxfPo= golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/kepub/kepub.go b/kepub/kepub.go index a812c9e..3e4d359 100644 --- a/kepub/kepub.go +++ b/kepub/kepub.go @@ -10,8 +10,7 @@ type Converter struct { // smart punctuation smartypants bool - // find/replace in raw html output 
(note: inefficient, but more efficient - // than working with strings) + // find/replace in raw html output find [][]byte replace [][]byte } @@ -40,9 +39,7 @@ func ConverterOptionSmartypants() ConverterOption { } } -// ConverterOptionFindReplace replaces a raw string in the transformed HTML -// (note that this impacts performance since it requires an additional temporary -// buffer to be created for each document). +// ConverterOptionFindReplace replaces a raw string in the transformed HTML. func ConverterOptionFindReplace(find, replace string) ConverterOption { return func(c *Converter) { c.find = append(c.find, []byte(find)) diff --git a/kepub/transform.go b/kepub/transform.go index d2b6949..a41e391 100644 --- a/kepub/transform.go +++ b/kepub/transform.go @@ -5,13 +5,14 @@ import ( "fmt" "io" "path" - "regexp" "strconv" "strings" "unicode" + "unicode/utf8" "github.com/beevik/etree" "github.com/kr/smartypants" + "golang.org/x/text/transform" "github.com/pgaskin/kepubify/_/html/golang.org/x/net/html" "github.com/pgaskin/kepubify/_/html/golang.org/x/net/html/atom" @@ -173,18 +174,9 @@ func (c *Converter) TransformContent(w io.Writer, r io.Reader) error { transformContentClean(doc) if len(c.find) != 0 { - buf := bytes.NewBuffer(nil) - if err := html.RenderWithOptions(buf, doc, - html.RenderOptionAllowXMLDeclarations(true), - html.RenderOptionPolyglot(true)); err != nil { - return err - } - b := buf.Bytes() - for i := range c.find { - b = bytes.ReplaceAll(b, c.find[i], c.replace[i]) - } - _, err := w.Write(b) - return err + wc := transformContentReplacements(w, c.find, c.replace) + w = wc + defer wc.Close() } err = html.RenderWithOptions(w, doc, @@ -241,8 +233,6 @@ func transformContentKoboDivs(doc *html.Node) { } } -var sentencere = regexp.MustCompile(`((?ms).*?[\.\!\?]['"”’“…]?\s+)`) - func transformContentKoboSpans(doc *html.Node) { // behavior matches Kobo (checked with 3 books) as of 2020-01-12 if findClass(findAtom(doc, 
atom.Body), "koboSpan") != nil { @@ -256,25 +246,13 @@ func transformContentKoboSpans(doc *html.Node) { var cur *html.Node stack = append(stack, findAtom(doc, atom.Body)) + sentences := make([]string, 0, 8) + for len(stack) != 0 { stack, cur = stack[:len(stack)-1], stack[len(stack)-1] switch cur.Type { case html.TextNode: - // split after each sentence (matches Kobo's behavior, and can't leave anything behind) - var sentences []string - if matches := sentencere.FindAllStringIndex(cur.Data, -1); len(matches) == 0 { - sentences = []string{cur.Data} // nothing matched, use the whole string - } else { - var pos int - sentences = make([]string, len(matches)) - for i, match := range matches { - sentences[i] = cur.Data[pos:match[1]] // end of last match to end of the current one - pos = match[1] - } - if len(cur.Data) > pos { - sentences = append(sentences, cur.Data[pos:]) // rest of the string, if any - } - } + sentences = splitSentences(cur.Data, sentences[:0]) // wrap each sentence in a span (don't wrap whitespace unless it is // directly under a P tag [TODO: are there any other cases we wrap @@ -343,6 +321,158 @@ func transformContentKoboSpans(doc *html.Node) { } } +// splitSentences splits the string into sentences using the rules for creating +// koboSpans. To make this zero-allocation, pass a zero-length slice for +// splitSentences to take ownership of. To re-use the slice, pass the returned +// slice, sliced to zero. If the slice is too small, it will be grown, causing +// an allocation. +// +// This state-machine based implementation is around three times as fast as the +// regexp-based one on average, and even faster when pre-allocating and re-using +// the sentences slice. It should have the same output. For the original +// implementation, see splitSentencesRegexp in the tests. 
+func splitSentences(str string, sentences []string) []string { + const ( + InputPunct = iota // sentence-terminating punctuation + InputExtra // additional punctuation (one is optionally consumed after punct if present) + InputSpace // whitespace + InputAny // any valid rune not previously matched + InputInvalid // an invalid byte + InputEOS // end-of-string + ) + const ( + OutputNone = iota // moves to the next rune. + OutputNext // adds everything from the last call up to (but not including) the current rune, and moves to the next rune. + OutputRest // adds everything not yet added by OutputNext (state must be -1) + ) + const ( + StateDefault = iota // in a sentence + StateAfterPunct // after the sentence-terminating rune + StateAfterPunctExtra // after the optional additional punctuation rune + StateAfterSpace // the trailing whitespace after the sentence + ) + + if sentences == nil { + sentences = make([]string, 0, 4) // pre-allocate some room + } + + for i, state := 0, 0; state != -1; { + x, z := utf8.DecodeRuneInString(str[i:]) + + var input int + switch x { + case utf8.RuneError: + switch z { + case 0: + input = InputEOS + default: + input = InputInvalid + } + case '.', '!', '?': + input = InputPunct + case '\'', '"', '”', '’', '“', '…': + input = InputExtra + case '\t', '\n', '\f', '\r', ' ': // \s matches only ASCII whitespace + input = InputSpace + default: + input = InputAny + } + + var output int + switch state { + case StateDefault: + switch input { + case InputPunct: + output, state = OutputNone, StateAfterPunct + case InputExtra: + output, state = OutputNone, StateDefault + case InputSpace: + output, state = OutputNone, StateDefault + case InputAny: + output, state = OutputNone, StateDefault + case InputInvalid: + output, state = OutputNone, StateDefault + case InputEOS: + output, state = OutputRest, -1 + default: + panic("unhandled input") + } + case StateAfterPunct: + switch input { + case InputPunct: + output, state = OutputNone, 
StateAfterPunct + case InputExtra: + output, state = OutputNone, StateAfterPunctExtra + case InputSpace: + output, state = OutputNone, StateAfterSpace + case InputAny: + output, state = OutputNone, StateDefault + case InputInvalid: + output, state = OutputNone, StateDefault + case InputEOS: + output, state = OutputRest, -1 + default: + panic("unhandled input") + } + case StateAfterPunctExtra: + switch input { + case InputPunct: + output, state = OutputNone, StateAfterPunct + case InputExtra: + output, state = OutputNone, StateDefault + case InputSpace: + output, state = OutputNone, StateAfterSpace + case InputAny: + output, state = OutputNone, StateDefault + case InputInvalid: + output, state = OutputNone, StateDefault + case InputEOS: + output, state = OutputRest, -1 + default: + panic("unhandled input") + } + case StateAfterSpace: + switch input { + case InputPunct: + output, state = OutputNext, StateAfterPunct + case InputExtra: + output, state = OutputNext, StateDefault + case InputSpace: + output, state = OutputNone, StateAfterSpace + case InputAny: + output, state = OutputNext, StateDefault + case InputInvalid: + output, state = OutputNext, StateDefault + case InputEOS: + output, state = OutputRest, -1 + default: + panic("unhandled input") + } + default: + panic("unhandled state") + } + + switch output { + case OutputNone: + i += z + case OutputNext: + sentences = append(sentences, str[:i]) + str, i = str[i:], z + case OutputRest: + if len(str) != 0 || len(sentences) == 0 { + sentences = append(sentences, str) + } + if state != -1 { + panic("invalid state") + } + default: + panic("unhandled output") + } + } + + return sentences +} + func koboSpan(para, seg int) *html.Node { return &html.Node{ Type: html.ElementNode, @@ -435,6 +565,23 @@ func transformContentClean(doc *html.Node) { } } +func transformContentReplacements(w io.Writer, find, replace [][]byte) io.WriteCloser { + var t []transform.Transformer + if len(find) != len(replace) { + panic("find and 
replace must be the same length") + } + for i := range find { + if len(find[i]) == 0 { + continue + } + t = append(t, &byteReplacer{ + Find: find[i], + Replace: replace[i], + }) + } + return transform.NewWriter(w, transform.Chain(t...)) +} + // withText adds text to a node and returns it. func withText(node *html.Node, text string) *html.Node { if node.Type != html.ElementNode { @@ -534,3 +681,81 @@ func includes(s, token string) bool { } return false } + +// byteReplacer is a Transformer which finds and replaces sequences of bytes. +type byteReplacer struct { + transform.NopResetter + Find, Replace []byte +} + +func (b *byteReplacer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if len(b.Find) == 0 { + panic("find length must not be zero") + } + + for { + // find the next match + i := bytes.Index(src[nSrc:], b.Find) + if i == -1 { + break + } + + // copy the non-matching prefix + if n := copy(dst[nDst:], src[nSrc:nSrc+i]); n == len(src[nSrc:nSrc+i]) { + nSrc += n + nDst += n + } else { + // skip what we've already processed + nSrc += n + nDst += n + // have it call us again with a larger destination buffer + err = transform.ErrShortDst + return + } + + // copy the new value + if n := copy(dst[nDst:], b.Replace); n == len(b.Replace) { + nSrc += len(b.Find) + nDst += n + } else { + // have it call us again with a larger destination buffer + err = transform.ErrShortDst + return + } + } + + if !atEOF { + // skip everything, minus the last len(b.Replace)-1 in case there is another + // partial match at the end + if skip := len(src[nSrc:]) - (len(b.Find) - 1); skip > 0 { + if n := copy(dst[nDst:], src[nSrc:nSrc+skip]); n == len(src[nSrc:nSrc+skip]) { + nSrc += n + nDst += n + } else { + // skip what we've already processed + nSrc += n + nDst += n + // have it call us again with a larger destination buffer + err = transform.ErrShortDst + return + } + } + + // have it call us again with more source bytes to find another match in + err = 
transform.ErrShortSrc + return + } + + // at EOF, and no more replacements, so copy the remaining bytes + if n := copy(dst[nDst:], src[nSrc:]); n == len(src[nSrc:]) { + nDst += n + nSrc += n + } else { + // skip what we've already copied + nDst += n + nSrc += n + // have it call us again with a larger destination buffer + err = transform.ErrShortDst + } + return +} diff --git a/kepub/transform_test.go b/kepub/transform_test.go index 958ce01..861e465 100644 --- a/kepub/transform_test.go +++ b/kepub/transform_test.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "io" + "regexp" "strings" "testing" @@ -16,7 +17,19 @@ func TestTransformContent(t *testing.T) { c := &Converter{ extraCSS: []string{"body { color: black; }"}, extraCSSClass: []string{"kepubify-test"}, - smartypants: true, + find: [][]byte{ + []byte("Test sentence 2."), + []byte(""), + []byte("sdfsdfsdf"), + []byte(" sentence 2"), + }, + replace: [][]byte{ + []byte("Replaced sentence 2."), + []byte(""), + []byte("dfgdfgdfg"), + []byte(nil), + }, + smartypants: true, } // yes, I know it isn't valid XML, but I'm testing preserving the XML declaration @@ -58,7 +71,7 @@ func TestTransformContent(t *testing.T) { in my tests for that library. -->
-

Test sentence 1. Test sentence 2. Test sentence 3Test sentence 4

+

Test sentence 1. Replaced. Test sentence 3Test sentence 4

Test sentence 5. “This is quoted”and this is not.

Sentence.

  • Another sentence.
  • Another sentence.
    • Another sentence.
    • Another sentence.
  • Another sentence.
Another sentence.

Test
@@ -309,6 +322,102 @@ func TestTransformContentParts(t *testing.T) {
 			Out:      `

testing

asdfgh

`, }.Run(t) }) + + t.Run("Replacements", func(t *testing.T) { + const corpus = `Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.` + for _, tc := range []struct { + What string + Replacements []string + }{ + { + What: "simple removal", + Replacements: []string{ + " ipsum", "", + }, + }, + { + What: "simple replacement", + Replacements: []string{ + ". ", "_ ", + }, + }, + { + What: "complex removal", // to ensure the transformer behaves correctly when it requires multiple writes from the renderer + Replacements: []string{ + "Lorem ipsum", "", + }, + }, + { + What: "long replacement", + Replacements: []string{ + "ipsum", strings.Repeat(".", 4096), + }, + }, + { + What: "simple chained", + Replacements: []string{ + "ipsum", "test1", + "amet", "test2", + "commodo", "", + }, + }, + { + What: "ordered chained", + Replacements: []string{ + "ipsum", "Lorem", + "Lorem", "test1", + "ipsum", "", + }, + }, + { + What: "overlapping chained", + Replacements: []string{ + "Lorem", "test1", + "test1", "test2", + "ipsum", "test2 ipsum", + "test2", "test3", + }, + }, + { + What: "complex chained", // to ensure order matters + Replacements: []string{ + "Lorem", "ipsum", + "or", "ar", + "dolar", "dolor", + "")[1], "")[0] // hacky, but the easiest way to do it @@ -558,3 +683,89 @@ func (tc transformXMLTestCase) Run(t *testing.T) { fmt.Println(b) } } + +var testSentences = []string{ + " ! Lorem ipsum dolor, sit amet. Consectetur adipiscing elit?\n Sed do eiusmod tempor incididunt!?! Ut labore et dolore \"magna aliqua.\". 
Ut enim ad “minim veniam”, quis nostrud exercitation 'ullamco laboris' nisi ut aliquip ex ea commodo consequat?… Duis aute irure dolor in reprehenderit in voluptate velit’s esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. sdfsdfsdf", + "Lorem ipsum dolor, sit amet. Consectetur adipiscing elit? Sed do eiusmod tempor incididunt!?! Ut labore et dolore magna aliqua.", + strings.Repeat("Lorem ipsum dolor sit amet. Consectetur adipiscing elit ut labore et dolore magna aliqua. ", 40), + " ", + "... !!! ??? .'.'.'.' ", + "test\u00a0.\u0080.\u00a0.", + "", + "🌝. 🌝 🌝. 🌝", + "!", + "? ", + "? ?", + "? ", + " ? ", + " ?' .", + " ?' . ", + " ?' . \xFF", + " ?' . \xFF .", + " ?' . \xe2\x82\x28\xFF", + " ?' . \xe2\x82\x28\xFF .", + " ?' . .\xe2\x82\x28\xFF", + " ?' . .\xe2\x82\x28\xFF .", + " ?' . .'\xe2\x82\x28\xFF", + " ?' . .'\xe2\x82\x28\xFF .", + " ?' . .'\xe2\x82\x28\xFF.", +} + +func TestSplitSentences(t *testing.T) { + for _, v := range testSentences { + sss := splitSentences(v, nil) + ssr := splitSentencesRegexp(v) + + if len(sss) == len(ssr) { + for i := range sss { + if sss[i] != ssr[i] { + t.Errorf("%q (new state-machine) != %q (old regexp)", sss, ssr) + } + } + } else { + t.Errorf("%q (new state-machine) != %q (old regexp)", sss, ssr) + } + + if j := strings.Join(sss, ""); j != v { + t.Errorf("%q (joined sentence) != %q (original sentence)", j, v) + } + } +} + +func BenchmarkSplitSentences(b *testing.B) { + b.SetParallelism(1) // for more accurate results + b.Run("Regexp", func(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, v := range testSentences { + splitSentencesRegexp(v) + } + } + }) + b.Run("StateMachine", func(b *testing.B) { + sentences := make([]string, 0, 8) + for i := 0; i < b.N; i++ { + for _, v := range testSentences { + sentences = splitSentences(v, sentences[:0]) + } + } + }) +} + +var sentenceRe = 
regexp.MustCompile(`((?ms).*?[\.\!\?]['"”’“…]?\s+)`) + +func splitSentencesRegexp(str string) (r []string) { + if matches := sentenceRe.FindAllStringIndex(str, -1); len(matches) == 0 { + r = []string{str} // nothing matched, use the whole string + } else { + var pos int + r = make([]string, len(matches)) + for i, match := range matches { + r[i] = str[pos:match[1]] // end of last match to end of the current one + pos = match[1] + } + if len(str) > pos { + r = append(r, str[pos:]) // rest of the string, if any + } + } + return +}