From 09b5f47acc6eca3d2d4ed40e437261fb66386de2 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Thu, 8 Feb 2018 18:31:48 -0800 Subject: [PATCH] sort symbols in order of frequency rather than lexicographically Signed-off-by: Callum Styan --- block.go | 6 +++--- compact.go | 7 ++++--- head.go | 22 +++++++++++----------- head_test.go | 12 ++++++------ index/index.go | 31 +++++++++++++++++++++---------- index/index_test.go | 20 ++++++++++---------- querier_test.go | 10 +++++----- 7 files changed, 60 insertions(+), 48 deletions(-) diff --git a/block.go b/block.go index e5a66bd9..d0214c7c 100644 --- a/block.go +++ b/block.go @@ -34,7 +34,7 @@ import ( type IndexWriter interface { // AddSymbols registers all string symbols that are encountered in series // and other indices. - AddSymbols(sym map[string]struct{}) error + AddSymbols(sym map[string]int) error // AddSeries populates the index writer with a series and its offsets // of chunks that the index can reference. @@ -61,7 +61,7 @@ type IndexWriter interface { type IndexReader interface { // Symbols returns a set of string symbols that may occur in series' labels // and indices. - Symbols() (map[string]struct{}, error) + Symbols() (map[string]int, error) // LabelValues returns the possible label values. LabelValues(names ...string) (index.StringTuples, error) @@ -368,7 +368,7 @@ type blockIndexReader struct { b *Block } -func (r blockIndexReader) Symbols() (map[string]struct{}, error) { +func (r blockIndexReader) Symbols() (map[string]int, error) { s, err := r.ir.Symbols() return s, errors.Wrapf(err, "block: %s", r.b.Meta().ULID) } diff --git a/compact.go b/compact.go index f8e6ff54..3055686d 100644 --- a/compact.go +++ b/compact.go @@ -548,7 +548,7 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, var ( set ChunkSeriesSet - allSymbols = make(map[string]struct{}, 1<<16) + allSymbols = make(map[string]int, 1<<16) closers = []io.Closer{} ) defer func() { closeAll(closers...) 
}() @@ -577,7 +577,8 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, return errors.Wrap(err, "read symbols") } - for s := range symbols { - allSymbols[s] = struct{}{} + // Accumulate instead of overwriting so counts from every index read here add up. + for s, n := range symbols { + allSymbols[s] += n } all, err := indexr.Postings(index.AllPostingsKey()) diff --git a/head.go b/head.go index cbc8661f..c2500940 100644 --- a/head.go +++ b/head.go @@ -67,7 +67,7 @@ type Head struct { series *stripeSeries symMtx sync.RWMutex - symbols map[string]struct{} + symbols map[string]int values map[string]stringset // label names to possible values postings *index.MemPostings // postings lists for terms @@ -229,7 +229,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, chunkRange int maxTime: math.MinInt64, series: newStripeSeries(), values: map[string]stringset{}, - symbols: map[string]struct{}{}, + symbols: make(map[string]int), postings: index.NewUnorderedMemPostings(), tombstones: newMemTombstones(), } @@ -897,12 +897,12 @@ func (h *Head) gc() { h.postings.Delete(deleted) // Rebuild symbols and label value indices from what is left in the postings terms. 
- symbols := make(map[string]struct{}) + symbols := make(map[string]int) values := make(map[string]stringset, len(h.values)) if err := h.postings.Iter(func(t labels.Label, _ index.Postings) error { - symbols[t.Name] = struct{}{} - symbols[t.Value] = struct{}{} + symbols[t.Name]++ + symbols[t.Value]++ ss, ok := values[t.Name] if !ok { @@ -1046,14 +1046,14 @@ func (h *headIndexReader) Close() error { return nil } -func (h *headIndexReader) Symbols() (map[string]struct{}, error) { +func (h *headIndexReader) Symbols() (map[string]int, error) { h.head.symMtx.RLock() defer h.head.symMtx.RUnlock() - res := make(map[string]struct{}, len(h.head.symbols)) + res := make(map[string]int, len(h.head.symbols)) - for s := range h.head.symbols { - res[s] = struct{}{} + for s, num := range h.head.symbols { + res[s] = num } return res, nil } @@ -1202,8 +1202,8 @@ func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSerie } valset.set(l.Value) - h.symbols[l.Name] = struct{}{} - h.symbols[l.Value] = struct{}{} + h.symbols[l.Name]++ + h.symbols[l.Value]++ } return s, true diff --git a/head_test.go b/head_test.go index 8781f677..d8f93380 100644 --- a/head_test.go +++ b/head_test.go @@ -211,12 +211,12 @@ func TestHead_Truncate(t *testing.T) { testutil.Assert(t, postingsB2 == nil, "") testutil.Assert(t, postingsC1 == nil, "") - testutil.Equals(t, map[string]struct{}{ - "": {}, // from 'all' postings list - "a": {}, - "b": {}, - "1": {}, - "2": {}, + testutil.Equals(t, map[string]int{ + "": 2, // from 'all' postings list + "a": 2, + "b": 1, + "1": 2, + "2": 1, }, h.symbols) testutil.Equals(t, map[string]stringset{ diff --git a/index/index.go b/index/index.go index 6413a9fc..a5f095ec 100644 --- a/index/index.go +++ b/index/index.go @@ -57,6 +57,18 @@ func (s indexWriterSeriesSlice) Less(i, j int) bool { return labels.Compare(s[i].labels, s[j].labels) < 0 } +type symbolFrequencyPair struct { + symbol string + frequency int +} + +// symbolFrequencylist implements sort.Interface: most frequent symbols first, ties broken by symbol for a deterministic order. +type symbolFrequencylist 
[]symbolFrequencyPair + +func (s symbolFrequencylist) Len() int { return len(s) } +func (s symbolFrequencylist) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s symbolFrequencylist) Less(i, j int) bool { return s[i].frequency > s[j].frequency || (s[i].frequency == s[j].frequency && s[i].symbol < s[j].symbol) } + type indexWriterStage uint8 const ( @@ -334,17 +346,17 @@ func (w *Writer) AddSeries(ref uint64, lset labels.Labels, chunks ...chunks.Meta return nil } -func (w *Writer) AddSymbols(sym map[string]struct{}) error { +func (w *Writer) AddSymbols(sym map[string]int) error { if err := w.ensureStage(idxStageSymbols); err != nil { return err } - // Generate sorted list of strings we will store as reference table. - symbols := make([]string, 0, len(sym)) + // Generate the list of symbols, most frequent first, that we will store as reference table. + symbols := make(symbolFrequencylist, 0, len(sym)) - for s := range sym { - symbols = append(symbols, s) + for k, v := range sym { + symbols = append(symbols, symbolFrequencyPair{k, v}) } - sort.Strings(symbols) + sort.Sort(symbols) const headerSize = 4 @@ -356,8 +368,8 @@ func (w *Writer) AddSymbols(sym map[string]struct{}) error { w.symbols = make(map[string]uint32, len(symbols)) for index, s := range symbols { - w.symbols[s] = uint32(index) - w.buf2.putUvarintStr(s) + w.symbols[s.symbol] = uint32(index) + w.buf2.putUvarintStr(s.symbol) } w.buf1.putBE32int(w.buf2.len()) @@ -856,11 +868,11 @@ func (r *Reader) lookupSymbol(o uint32) (string, error) { } // Symbols returns a set of symbols that exist within the index. 
-func (r *Reader) Symbols() (map[string]struct{}, error) { - res := make(map[string]struct{}, len(r.symbols)) +func (r *Reader) Symbols() (map[string]int, error) { + res := make(map[string]int, len(r.symbols)) for _, s := range r.symbols { - res[s] = struct{}{} + res[s] = 0 } for _, s := range r.symbolSlice { - res[s] = struct{}{} + res[s] = 0 // res is now map[string]int; the old struct{}{} value no longer compiles. diff --git a/index/index_test.go b/index/index_test.go index f915bca2..7de7e348 100644 --- a/index/index_test.go +++ b/index/index_test.go @@ -189,13 +189,13 @@ func TestIndexRW_Postings(t *testing.T) { labels.FromStrings("a", "1", "b", "4"), } - err = iw.AddSymbols(map[string]struct{}{ - "a": {}, - "b": {}, - "1": {}, - "2": {}, - "3": {}, - "4": {}, + err = iw.AddSymbols(map[string]int{ + "a": 1, + "b": 2, + "1": 1, + "2": 4, + "3": 5, + "4": 3, }) testutil.Ok(t, err) @@ -243,11 +243,11 @@ func TestPersistence_index_e2e(t *testing.T) { // Sort labels as the index writer expects series in sorted order. sort.Sort(labels.Slice(lbls)) - symbols := map[string]struct{}{} + symbols := make(map[string]int) for _, lset := range lbls { for _, l := range lset { - symbols[l.Name] = struct{}{} - symbols[l.Value] = struct{}{} + symbols[l.Name] = 0 + symbols[l.Value] = 0 } } diff --git a/querier_test.go b/querier_test.go index 79dfbff7..2720f7a2 100644 --- a/querier_test.go +++ b/querier_test.go @@ -1345,7 +1345,7 @@ type mockIndex struct { series map[uint64]series labelIndex map[string][]string postings map[labels.Label][]uint64 - symbols map[string]struct{} + symbols map[string]int } func newMockIndex() mockIndex { @@ -1353,12 +1353,12 @@ func newMockIndex() mockIndex { series: make(map[uint64]series), labelIndex: make(map[string][]string), postings: make(map[labels.Label][]uint64), - symbols: make(map[string]struct{}), + symbols: make(map[string]int), } return ix } -func (m mockIndex) Symbols() (map[string]struct{}, error) { +func (m mockIndex) Symbols() (map[string]int, error) { return m.symbols, nil } @@ -1367,8 +1367,8 @@ func (m mockIndex) 
AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) return errors.Errorf("series with reference %d already added", ref) } for _, lbl := range l { - m.symbols[lbl.Name] = struct{}{} - m.symbols[lbl.Value] = struct{}{} + m.symbols[lbl.Name] = 0 + m.symbols[lbl.Value] = 0 } s := series{l: l}