Skip to content

Commit

Permalink
kepub: Process replacements while rendering the content
Browse files Browse the repository at this point in the history
By using golang.org/x/text/transform instead of rendering to a temporary
buffer and replacing in that, we can handle the find/replace option more
efficiently.
  • Loading branch information
pgaskin committed Jul 2, 2021
1 parent 7848002 commit f35a7bf
Show file tree
Hide file tree
Showing 5 changed files with 476 additions and 41 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ require (
github.com/pgaskin/kepubify/_/go116-zip.go117 v0.0.0-20210611152744-2d89b3182523
github.com/pgaskin/kepubify/_/html v0.0.0-20210611145339-337924fbbaf0
golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520
golang.org/x/text v0.3.6
)

require github.com/hexops/gotextdiff v1.0.3
1 change: 1 addition & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520 h1:Bx6FllMpG4NWDOfhMBz1VR2QYNp/SAOHPIAsaVmxfPo=
golang.org/x/sync v0.0.0-20201008141435-b3e1573b7520/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
7 changes: 2 additions & 5 deletions kepub/kepub.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ type Converter struct {
// smart punctuation
smartypants bool

// find/replace in raw html output (note: inefficient, but more efficient
// than working with strings)
// find/replace in raw html output
find [][]byte
replace [][]byte
}
Expand Down Expand Up @@ -40,9 +39,7 @@ func ConverterOptionSmartypants() ConverterOption {
}
}

// ConverterOptionFindReplace replaces a raw string in the transformed HTML
// (note that this impacts performance since it requires an additional temporary
// buffer to be created for each document).
// ConverterOptionFindReplace replaces a raw string in the transformed HTML.
func ConverterOptionFindReplace(find, replace string) ConverterOption {
return func(c *Converter) {
c.find = append(c.find, []byte(find))
Expand Down
285 changes: 255 additions & 30 deletions kepub/transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ import (
"fmt"
"io"
"path"
"regexp"
"strconv"
"strings"
"unicode"
"unicode/utf8"

"github.com/beevik/etree"
"github.com/kr/smartypants"
"golang.org/x/text/transform"

"github.com/pgaskin/kepubify/_/html/golang.org/x/net/html"
"github.com/pgaskin/kepubify/_/html/golang.org/x/net/html/atom"
Expand Down Expand Up @@ -173,18 +174,9 @@ func (c *Converter) TransformContent(w io.Writer, r io.Reader) error {
transformContentClean(doc)

if len(c.find) != 0 {
buf := bytes.NewBuffer(nil)
if err := html.RenderWithOptions(buf, doc,
html.RenderOptionAllowXMLDeclarations(true),
html.RenderOptionPolyglot(true)); err != nil {
return err
}
b := buf.Bytes()
for i := range c.find {
b = bytes.ReplaceAll(b, c.find[i], c.replace[i])
}
_, err := w.Write(b)
return err
wc := transformContentReplacements(w, c.find, c.replace)
w = wc
defer wc.Close()
}

err = html.RenderWithOptions(w, doc,
Expand Down Expand Up @@ -241,8 +233,6 @@ func transformContentKoboDivs(doc *html.Node) {
}
}

var sentencere = regexp.MustCompile(`((?ms).*?[\.\!\?]['"”’“…]?\s+)`)

func transformContentKoboSpans(doc *html.Node) {
// behavior matches Kobo (checked with 3 books) as of 2020-01-12
if findClass(findAtom(doc, atom.Body), "koboSpan") != nil {
Expand All @@ -256,25 +246,13 @@ func transformContentKoboSpans(doc *html.Node) {
var cur *html.Node
stack = append(stack, findAtom(doc, atom.Body))

sentences := make([]string, 0, 8)

for len(stack) != 0 {
stack, cur = stack[:len(stack)-1], stack[len(stack)-1]
switch cur.Type {
case html.TextNode:
// split after each sentence (matches Kobo's behavior, and can't leave anything behind)
var sentences []string
if matches := sentencere.FindAllStringIndex(cur.Data, -1); len(matches) == 0 {
sentences = []string{cur.Data} // nothing matched, use the whole string
} else {
var pos int
sentences = make([]string, len(matches))
for i, match := range matches {
sentences[i] = cur.Data[pos:match[1]] // end of last match to end of the current one
pos = match[1]
}
if len(cur.Data) > pos {
sentences = append(sentences, cur.Data[pos:]) // rest of the string, if any
}
}
sentences = splitSentences(cur.Data, sentences[:0])

// wrap each sentence in a span (don't wrap whitespace unless it is
// directly under a P tag [TODO: are there any other cases we wrap
Expand Down Expand Up @@ -343,6 +321,158 @@ func transformContentKoboSpans(doc *html.Node) {
}
}

// splitSentences splits str into the sentence fragments that koboSpans are
// wrapped around. Every byte of str ends up in exactly one fragment: a
// fragment runs through its terminating punctuation ([.!?], possibly
// repeated, plus at most one trailing quote/ellipsis rune) and the
// whitespace which follows it; whatever remains at the end of the string
// becomes the final fragment (and an empty string yields one empty
// fragment).
//
// To make this zero-allocation, pass a zero-length slice for splitSentences
// to take ownership of. To re-use the slice, pass the returned slice,
// sliced to zero. If the slice is too small, it will be grown, causing an
// allocation.
//
// This hand-rolled state machine is around three times as fast as the
// regexp-based implementation on average (see splitSentencesRegexp in the
// tests), and even faster when the sentences slice is re-used. The output
// is identical.
func splitSentences(str string, sentences []string) []string {
	// rune classes
	const (
		classPunct = iota // sentence-terminating punctuation: . ! ?
		classExtra        // optional quote/ellipsis runes (at most one consumed after the terminator)
		classSpace        // ASCII whitespace only (what regexp \s matches)
		classOther        // any other valid rune
		classBad          // an invalid UTF-8 byte
		classEnd          // end of the string
	)

	// machine states
	const (
		stSentence      = iota // somewhere inside a sentence
		stPunct                // just saw terminating punctuation
		stPunctExtra           // just saw the optional quote/ellipsis after the punctuation
		stTrailingSpace        // in the whitespace run after the terminator
		stDone          = -1   // finished
	)

	if sentences == nil {
		sentences = make([]string, 0, 4) // pre-allocate some room
	}

	for i, st := 0, stSentence; st != stDone; {
		r, n := utf8.DecodeRuneInString(str[i:])

		// classify the current rune
		var cl int
		if r == utf8.RuneError {
			if n == 0 {
				cl = classEnd
			} else {
				// NOTE: a literal U+FFFD decodes to RuneError with n==3 and
				// is also classified here, same as the original behavior
				cl = classBad
			}
		} else {
			switch r {
			case '.', '!', '?':
				cl = classPunct
			case '\'', '"', '”', '’', '“', '…':
				cl = classExtra
			case '\t', '\n', '\f', '\r', ' ':
				cl = classSpace
			default:
				cl = classOther
			}
		}

		// the end of the string flushes whatever is left, and guarantees at
		// least one fragment is returned even for an empty string
		if cl == classEnd {
			if len(str) != 0 || len(sentences) == 0 {
				sentences = append(sentences, str)
			}
			st = stDone
			continue
		}

		// a completed sentence is emitted when its run of trailing
		// whitespace ends; the fragment is everything before the current
		// rune, and scanning restarts at the current rune
		if st == stTrailingSpace && cl != classSpace {
			sentences = append(sentences, str[:i])
			str, i = str[i:], 0
		}

		// state transitions (equivalent to the regexp `.*?[.!?]['"”’“…]?\s+`)
		switch {
		case cl == classPunct:
			st = stPunct
		case cl == classExtra && st == stPunct:
			st = stPunctExtra
		case cl == classSpace && st != stSentence:
			st = stTrailingSpace
		default:
			st = stSentence
		}

		i += n
	}

	return sentences
}

func koboSpan(para, seg int) *html.Node {
return &html.Node{
Type: html.ElementNode,
Expand Down Expand Up @@ -435,6 +565,23 @@ func transformContentClean(doc *html.Node) {
}
}

// transformContentReplacements wraps w so that anything written to the
// returned WriteCloser has each occurrence of find[i] replaced with
// replace[i] (applied in order, as a chain) before reaching w. Empty find
// patterns are skipped. Close must be called to flush buffered partial
// matches. Mismatched slice lengths are a programming error and panic.
func transformContentReplacements(w io.Writer, find, replace [][]byte) io.WriteCloser {
	if len(find) != len(replace) {
		panic("find and replace must be the same length")
	}
	var replacers []transform.Transformer
	for i, f := range find {
		if len(f) == 0 {
			continue
		}
		replacers = append(replacers, &byteReplacer{Find: f, Replace: replace[i]})
	}
	return transform.NewWriter(w, transform.Chain(replacers...))
}

// withText adds text to a node and returns it.
func withText(node *html.Node, text string) *html.Node {
if node.Type != html.ElementNode {
Expand Down Expand Up @@ -534,3 +681,81 @@ func includes(s, token string) bool {
}
return false
}

// byteReplacer is a streaming transform.Transformer which replaces every
// occurrence of Find with Replace, scanning left-to-right with
// non-overlapping matches (the streaming equivalent of bytes.ReplaceAll,
// which the pre-streaming implementation used). Find must be non-empty;
// Replace may be any length, including zero. The zero Reset comes from
// transform.NopResetter since the replacer keeps no state between calls.
type byteReplacer struct {
	transform.NopResetter
	Find, Replace []byte
}

// Transform implements transform.Transformer, copying src to dst while
// replacing every occurrence of b.Find with b.Replace.
//
// Per the transform.Transformer contract: when dst runs out of room, it
// returns transform.ErrShortDst after recording what was consumed/produced
// so far, and the framework re-invokes it with more destination space. When
// not at EOF, it holds back the last len(b.Find)-1 bytes of src (which
// could be the start of a match completed by the next chunk) and returns
// transform.ErrShortSrc to request more input.
func (b *byteReplacer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if len(b.Find) == 0 {
		// an empty pattern would match everywhere; the constructor
		// (transformContentReplacements) filters these out, so this is a
		// programmer error
		panic("find length must not be zero")
	}

	for {
		// find the next full match in the unprocessed input
		i := bytes.Index(src[nSrc:], b.Find)
		if i == -1 {
			break
		}

		// copy the non-matching prefix
		if n := copy(dst[nDst:], src[nSrc:nSrc+i]); n == len(src[nSrc:nSrc+i]) {
			nSrc += n
			nDst += n
		} else {
			// skip what we've already processed
			nSrc += n
			nDst += n
			// have it call us again with a larger destination buffer
			err = transform.ErrShortDst
			return
		}

		// copy the replacement in place of the match
		if n := copy(dst[nDst:], b.Replace); n == len(b.Replace) {
			nSrc += len(b.Find)
			nDst += n
		} else {
			// have it call us again with a larger destination buffer (nDst
			// was not advanced, so the partial copy is simply overwritten
			// on the retry)
			err = transform.ErrShortDst
			return
		}
	}

	if !atEOF {
		// copy everything, minus the last len(b.Find)-1 bytes, in case
		// there is another partial match at the end
		if skip := len(src[nSrc:]) - (len(b.Find) - 1); skip > 0 {
			if n := copy(dst[nDst:], src[nSrc:nSrc+skip]); n == len(src[nSrc:nSrc+skip]) {
				nSrc += n
				nDst += n
			} else {
				// skip what we've already processed
				nSrc += n
				nDst += n
				// have it call us again with a larger destination buffer
				err = transform.ErrShortDst
				return
			}
		}

		// have it call us again with more source bytes to find another match in
		err = transform.ErrShortSrc
		return
	}

	// at EOF, and no more replacements, so copy the remaining bytes
	if n := copy(dst[nDst:], src[nSrc:]); n == len(src[nSrc:]) {
		nDst += n
		nSrc += n
	} else {
		// skip what we've already copied
		nDst += n
		nSrc += n
		// have it call us again with a larger destination buffer
		err = transform.ErrShortDst
	}
	return
}
Loading

0 comments on commit f35a7bf

Please sign in to comment.