From bc80022e033a5462d1a9ce541f40a050994011cc Mon Sep 17 00:00:00 2001 From: Dirk Olbrich Date: Mon, 12 Apr 2021 23:42:51 +0200 Subject: [PATCH] publisher: Exclude comment and doctype elements from writeStats - Reorder code blocks - Rename cssClassCollectorWriter to htmlElementCollectorWriter, as it just collect html element information - Expand benchmark to test for minified and unminified content Fixes #8396 Fixes #8417 --- hugolib/site_test.go | 2 +- publisher/htmlElementsCollector.go | 295 +++++++++++++----------- publisher/htmlElementsCollector_test.go | 150 ++++++++++-- 3 files changed, 292 insertions(+), 155 deletions(-) diff --git a/hugolib/site_test.go b/hugolib/site_test.go index cd7ce51f8fd..365679a328b 100644 --- a/hugolib/site_test.go +++ b/hugolib/site_test.go @@ -1113,7 +1113,7 @@ ABC. els := stats.HTMLElements b.Assert(els.Classes, qt.HasLen, 3606) // (4 * 900) + 4 +2 - b.Assert(els.Tags, qt.HasLen, 9) + b.Assert(els.Tags, qt.HasLen, 8) b.Assert(els.IDs, qt.HasLen, 1) } } diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index d9479aafaa5..9f4be1ff5b7 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -20,21 +20,10 @@ import ( "strings" "sync" - "github.com/gohugoio/hugo/helpers" "golang.org/x/net/html" -) - -func newHTMLElementsCollector() *htmlElementsCollector { - return &htmlElementsCollector{ - elementSet: make(map[string]bool), - } -} -func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { - return &cssClassCollectorWriter{ - collector: collector, - } -} + "github.com/gohugoio/hugo/helpers" +) // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { @@ -59,7 +48,50 @@ func (h *HTMLElements) Sort() { sort.Strings(h.IDs) } -type cssClassCollectorWriter struct { +type htmlElementsCollector struct { + // Contains the raw HTML string. We will get the same element + // several times, and want to avoid costly reparsing when this + // is used for aggregated data only. + elementSet map[string]bool + + elements []htmlElement + + mu sync.RWMutex +} + +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + +func (c *htmlElementsCollector) getHTMLElements() HTMLElements { + var ( + classes []string + ids []string + tags []string + ) + + for _, el := range c.elements { + classes = append(classes, el.Classes...) + ids = append(ids, el.IDs...) + tags = append(tags, el.Tag) + } + + classes = helpers.UniqueStringsSorted(classes) + ids = helpers.UniqueStringsSorted(ids) + tags = helpers.UniqueStringsSorted(tags) + + els := HTMLElements{ + Classes: classes, + IDs: ids, + Tags: tags, + } + + return els +} + +type htmlElementsCollectorWriter struct { collector *htmlElementsCollector buff bytes.Buffer @@ -70,11 +102,19 @@ type cssClassCollectorWriter struct { quoteValue byte } -func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { + return &htmlElementsCollectorWriter{ + collector: collector, + } +} + +// Write splits the incoming stream into single html element and writes these into elementSet +func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) i := 0 for i < len(p) { + // if is not collecting, cycle through byte stream until start bracket "<" is found if !w.isCollecting { for ; i < len(p); i++ { b := p[i] @@ -86,109 +126,89 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { } if w.isCollecting { + // if is collecting, cycle through byte stream until end bracket ">" is found + // disregard any ">" if within a quote + // write bytes until found to buffer for ; i < len(p); i++ { b := p[i] w.toggleIfQuote(b) + w.buff.WriteByte(b) + if !w.inQuote && b == '>' { w.endCollecting() break } - w.buff.WriteByte(b) } + } - if !w.isCollecting { - if w.inPreTag != "" { - s := w.buff.String() - if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName { - w.inPreTag = "" - } - w.buff.Reset() - continue - } - - // First check if we have processed this element before. - w.collector.mu.RLock() - - // See /~https://github.com/dominikh/go-tools/issues/723 - //lint:ignore S1030 This construct avoids memory allocation for the string. - seen := w.collector.elementSet[string(w.buff.Bytes())] - w.collector.mu.RUnlock() - if seen { - w.buff.Reset() - continue - } - - s := w.buff.String() - - w.buff.Reset() + // if no end bracket ">" is found while collecting, but the stream ended + // this could mean we received chunks of a stream from e.g. the minify functionality + // next if loop will be skipped - if strings.HasPrefix(s, "" + if !w.isCollecting { + s := w.buff.String() + w.buff.Reset() + + // filter out unwanted tags + // empty string, just in case + // if within preformatted code blocks
, 
`, f("div textarea", "foo textareaclass", "")}, + {"Textarea tags content should be skipped", `
`, f("div textarea", "foo textareaclass", "")}, + {"DOCTYPE should beskipped", ``, f("", "", "")}, + {"Comments should be skipped", ``, f("", "", "")}, + // Issue #8417 + {"Tabs inline", `
d
`, f("div hr", "bar foo", "a")}, + {"Tabs on multiple rows", `
+
d
`, f("div form", "foo", "a b")}, } { for _, minify := range []bool{false, true} { c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) - if minify { if skipMinifyTest[test.name] { c.Skip("skip minify test") @@ -152,6 +152,106 @@ func BenchmarkClassCollectorWriter(b *testing.B) { for i := 0; i < b.N; i++ { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) fmt.Fprint(w, benchHTML) + } +} +const benchHTML = ` + + + +title + + + + +
+ + +
+ + + +

To force
line breaks
in a text,
use the br
element.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
MonthSavings
January$100
February$200
$300
+ + +` + +func BenchmarkElementsCollectorWriter(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, benchHTML) + } +} + +func BenchmarkElementsCollectorWriterMinified(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + var buf bytes.Buffer + m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, buf.String()) + } +} + +func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + m.Minify(media.HTMLType, w, strings.NewReader(benchHTML)) + } +} + +func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + var buf bytes.Buffer + m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)) + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, buf.String()) } }