diff --git a/hugolib/site_test.go b/hugolib/site_test.go index cd7ce51f8fd..365679a328b 100644 --- a/hugolib/site_test.go +++ b/hugolib/site_test.go @@ -1113,7 +1113,7 @@ ABC. els := stats.HTMLElements b.Assert(els.Classes, qt.HasLen, 3606) // (4 * 900) + 4 +2 - b.Assert(els.Tags, qt.HasLen, 9) + b.Assert(els.Tags, qt.HasLen, 8) b.Assert(els.IDs, qt.HasLen, 1) } } diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index d9479aafaa5..9f4be1ff5b7 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -20,21 +20,10 @@ import ( "strings" "sync" - "github.com/gohugoio/hugo/helpers" "golang.org/x/net/html" -) - -func newHTMLElementsCollector() *htmlElementsCollector { - return &htmlElementsCollector{ - elementSet: make(map[string]bool), - } -} -func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { - return &cssClassCollectorWriter{ - collector: collector, - } -} + "github.com/gohugoio/hugo/helpers" +) // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { @@ -59,7 +48,50 @@ func (h *HTMLElements) Sort() { sort.Strings(h.IDs) } -type cssClassCollectorWriter struct { +type htmlElementsCollector struct { + // Contains the raw HTML string. We will get the same element + // several times, and want to avoid costly reparsing when this + // is used for aggregated data only. + elementSet map[string]bool + + elements []htmlElement + + mu sync.RWMutex +} + +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + +func (c *htmlElementsCollector) getHTMLElements() HTMLElements { + var ( + classes []string + ids []string + tags []string + ) + + for _, el := range c.elements { + classes = append(classes, el.Classes...) + ids = append(ids, el.IDs...) + tags = append(tags, el.Tag) + } + + classes = helpers.UniqueStringsSorted(classes) + ids = helpers.UniqueStringsSorted(ids) + tags = helpers.UniqueStringsSorted(tags) + + els := HTMLElements{ + Classes: classes, + IDs: ids, + Tags: tags, + } + + return els +} + +type htmlElementsCollectorWriter struct { collector *htmlElementsCollector buff bytes.Buffer @@ -70,11 +102,19 @@ type cssClassCollectorWriter struct { quoteValue byte } -func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { + return &htmlElementsCollectorWriter{ + collector: collector, + } +} + +// Write splits the incoming stream into single html element and writes these into elementSet +func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) i := 0 for i < len(p) { + // if is not collecting, cycle through byte stream until start bracket "<" is found if !w.isCollecting { for ; i < len(p); i++ { b := p[i] @@ -86,109 +126,89 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { } if w.isCollecting { + // if is collecting, cycle through byte stream until end bracket ">" is found + // disregard any ">" if within a quote + // write bytes until found to buffer for ; i < len(p); i++ { b := p[i] w.toggleIfQuote(b) + w.buff.WriteByte(b) + if !w.inQuote && b == '>' { w.endCollecting() break } - w.buff.WriteByte(b) } + } - if !w.isCollecting { - if w.inPreTag != "" { - s := w.buff.String() - if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName { - w.inPreTag = "" - } - w.buff.Reset() - continue - } - - // First check if we have processed this element before. - w.collector.mu.RLock() - - // See https://github.com/dominikh/go-tools/issues/723 - //lint:ignore S1030 This construct avoids memory allocation for the string. - seen := w.collector.elementSet[string(w.buff.Bytes())] - w.collector.mu.RUnlock() - if seen { - w.buff.Reset() - continue - } - - s := w.buff.String() - - w.buff.Reset() + // if no end bracket ">" is found while collecting, but the stream ended + // this could mean we received chunks of a stream from e.g. the minify functionality + // next if loop will be skipped - if strings.HasPrefix(s, "") { - continue + // at this point we have collected an element line between angle brackets "<" and ">" + if !w.isCollecting { + s := w.buff.String() + w.buff.Reset() + + // filter out unwanted tags + // empty string, just in case + // if within preformatted code blocks
,