From 508d576dc369bd538a46ce759fe5b3cb619104e2 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Thu, 8 Feb 2018 18:31:48 -0800 Subject: [PATCH] sort symbols in order of frequency rather than lexicographically --- block.go | 6 +++--- compact.go | 4 ++-- head.go | 20 ++++++++++---------- head_test.go | 12 ++++++------ index/index.go | 31 +++++++++++++++++++++---------- index/index_test.go | 20 ++++++++++---------- querier_test.go | 10 +++++----- 7 files changed, 57 insertions(+), 46 deletions(-) diff --git a/block.go b/block.go index 7dc3af9d..da2f6dae 100644 --- a/block.go +++ b/block.go @@ -34,7 +34,7 @@ import ( type IndexWriter interface { // AddSymbols registers all string symbols that are encountered in series // and other indices. - AddSymbols(sym map[string]struct{}) error + AddSymbols(sym map[string]int) error // AddSeries populates the index writer with a series and its offsets // of chunks that the index can reference. @@ -61,7 +61,7 @@ type IndexWriter interface { type IndexReader interface { // Symbols returns a set of string symbols that may occur in series' labels // and indices. - Symbols() (map[string]struct{}, error) + Symbols() (map[string]int, error) // LabelValues returns the possible label values. LabelValues(names ...string) (index.StringTuples, error) @@ -350,7 +350,7 @@ type blockIndexReader struct { b *Block } -func (r blockIndexReader) Symbols() (map[string]struct{}, error) { +func (r blockIndexReader) Symbols() (map[string]int, error) { s, err := r.ir.Symbols() return s, errors.Wrapf(err, "block: %s", r.b.Meta().ULID) } diff --git a/compact.go b/compact.go index 16a3bd74..46169c0c 100644 --- a/compact.go +++ b/compact.go @@ -509,7 +509,7 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter) error { var ( set ChunkSeriesSet - allSymbols = make(map[string]struct{}, 1<<16) + allSymbols = make(map[string]int, 1<<16) closers = []io.Closer{} ) defer func() { closeAll(closers...) }() @@ -538,7 +538,7 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, return errors.Wrap(err, "read symbols") } for s := range symbols { - allSymbols[s] = struct{}{} + allSymbols[s] = symbols[s] } all, err := indexr.Postings(index.AllPostingsKey()) diff --git a/head.go b/head.go index c76c139d..eb901482 100644 --- a/head.go +++ b/head.go @@ -64,7 +64,7 @@ type Head struct { series *stripeSeries symMtx sync.RWMutex - symbols map[string]struct{} + symbols map[string]int values map[string]stringset // label names to possible values postings *index.MemPostings // postings lists for terms @@ -187,7 +187,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal WAL, chunkRange int64) ( maxTime: math.MinInt64, series: newStripeSeries(), values: map[string]stringset{}, - symbols: map[string]struct{}{}, + symbols: make(map[string]int), postings: index.NewUnorderedMemPostings(), tombstones: memTombstones{}, } @@ -623,12 +623,12 @@ func (h *Head) gc() { h.postings.Delete(deleted) // Rebuild symbols and label value indices from what is left in the postings terms. - symbols := make(map[string]struct{}) + symbols := make(map[string]int) values := make(map[string]stringset, len(h.values)) h.postings.Iter(func(t labels.Label, _ index.Postings) error { - symbols[t.Name] = struct{}{} - symbols[t.Value] = struct{}{} + symbols[t.Name]++ + symbols[t.Value]++ ss, ok := values[t.Name] if !ok { @@ -771,14 +771,14 @@ func (h *headIndexReader) Close() error { return nil } -func (h *headIndexReader) Symbols() (map[string]struct{}, error) { +func (h *headIndexReader) Symbols() (map[string]int, error) { h.head.symMtx.RLock() defer h.head.symMtx.RUnlock() - res := make(map[string]struct{}, len(h.head.symbols)) + res := make(map[string]int, len(h.head.symbols)) for s := range h.head.symbols { - res[s] = struct{}{} + res[s] = 0 } return res, nil } @@ -910,8 +910,8 @@ func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSerie } valset.set(l.Value) - h.symbols[l.Name] = struct{}{} - h.symbols[l.Value] = struct{}{} + h.symbols[l.Name]++ + h.symbols[l.Value]++ } return s, true diff --git a/head_test.go b/head_test.go index 36a97542..73593f77 100644 --- a/head_test.go +++ b/head_test.go @@ -178,12 +178,12 @@ func TestHead_Truncate(t *testing.T) { testutil.Assert(t, postingsB2 == nil, "") testutil.Assert(t, postingsC1 == nil, "") - testutil.Equals(t, map[string]struct{}{ - "": struct{}{}, // from 'all' postings list - "a": struct{}{}, - "b": struct{}{}, - "1": struct{}{}, - "2": struct{}{}, + testutil.Equals(t, map[string]int{ + "": 2, // from 'all' postings list + "a": 2, + "b": 1, + "1": 2, + "2": 1, }, h.symbols) testutil.Equals(t, map[string]stringset{ diff --git a/index/index.go b/index/index.go index 3cad716c..d9e1578d 100644 --- a/index/index.go +++ b/index/index.go @@ -55,6 +55,17 @@ func (s indexWriterSeriesSlice) Less(i, j int) bool { return labels.Compare(s[i].labels, s[j].labels) < 0 } +type symbolFrequencyPair struct { + symbol string + frequency int +} + +type symbolFrequencylist []symbolFrequencyPair + +func (s symbolFrequencylist) Len() int { return len(s) } +func (s symbolFrequencylist) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s symbolFrequencylist) Less(i, j int) bool { return s[i].frequency < s[j].frequency } + type indexWriterStage uint8 const ( @@ -330,17 +341,17 @@ func (w *Writer) AddSeries(ref uint64, lset labels.Labels, chunks ...chunks.Meta return nil } -func (w *Writer) AddSymbols(sym map[string]struct{}) error { +func (w *Writer) AddSymbols(sym map[string]int) error { if err := w.ensureStage(idxStageSymbols); err != nil { return err } // Generate sorted list of strings we will store as reference table. - symbols := make([]string, 0, len(sym)) + symbols := make(symbolFrequencylist, 0, len(sym)) - for s := range sym { - symbols = append(symbols, s) + for k, v := range sym { + symbols = append(symbols, symbolFrequencyPair{k, v}) } - sort.Strings(symbols) + sort.Sort(sort.Reverse(symbols)) const headerSize = 4 @@ -352,8 +363,8 @@ func (w *Writer) AddSymbols(sym map[string]struct{}) error { w.symbols = make(map[string]uint32, len(symbols)) for index, s := range symbols { - w.symbols[s] = uint32(index) - w.buf2.putUvarintStr(s) + w.symbols[s.symbol] = uint32(index) + w.buf2.putUvarintStr(s.symbol) } w.buf1.putBE32int(w.buf2.len()) @@ -832,11 +843,11 @@ func (r *Reader) lookupSymbol(o uint32) (string, error) { } // Symbols returns a set of symbols that exist within the index. -func (r *Reader) Symbols() (map[string]struct{}, error) { - res := make(map[string]struct{}, len(r.symbols)) +func (r *Reader) Symbols() (map[string]int, error) { + res := make(map[string]int, len(r.symbols)) for _, s := range r.symbols { - res[s] = struct{}{} + res[s] = 0 } return res, nil } diff --git a/index/index_test.go b/index/index_test.go index 83b6ef65..14b90f3a 100644 --- a/index/index_test.go +++ b/index/index_test.go @@ -191,13 +191,13 @@ func TestIndexRW_Postings(t *testing.T) { labels.FromStrings("a", "1", "b", "4"), } - err = iw.AddSymbols(map[string]struct{}{ - "a": struct{}{}, - "b": struct{}{}, - "1": struct{}{}, - "2": struct{}{}, - "3": struct{}{}, - "4": struct{}{}, + err = iw.AddSymbols(map[string]int{ + "a": 1, + "b": 2, + "1": 1, + "2": 4, + "3": 5, + "4": 3, }) testutil.Ok(t, err) @@ -245,11 +245,11 @@ func TestPersistence_index_e2e(t *testing.T) { // Sort labels as the index writer expects series in sorted order. sort.Sort(labels.Slice(lbls)) - symbols := map[string]struct{}{} + symbols := make(map[string]int) for _, lset := range lbls { for _, l := range lset { - symbols[l.Name] = struct{}{} - symbols[l.Value] = struct{}{} + symbols[l.Name] = 0 + symbols[l.Value] = 0 } } diff --git a/querier_test.go b/querier_test.go index 2eb10471..ad07618c 100644 --- a/querier_test.go +++ b/querier_test.go @@ -1367,7 +1367,7 @@ type mockIndex struct { series map[uint64]series labelIndex map[string][]string postings map[labels.Label][]uint64 - symbols map[string]struct{} + symbols map[string]int } func newMockIndex() mockIndex { @@ -1375,12 +1375,12 @@ func newMockIndex() mockIndex { series: make(map[uint64]series), labelIndex: make(map[string][]string), postings: make(map[labels.Label][]uint64), - symbols: make(map[string]struct{}), + symbols: make(map[string]int), } return ix } -func (m mockIndex) Symbols() (map[string]struct{}, error) { +func (m mockIndex) Symbols() (map[string]int, error) { return m.symbols, nil } @@ -1389,8 +1389,8 @@ func (m mockIndex) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) return errors.Errorf("series with reference %d already added", ref) } for _, lbl := range l { - m.symbols[lbl.Name] = struct{}{} - m.symbols[lbl.Value] = struct{}{} + m.symbols[lbl.Name] = 0 + m.symbols[lbl.Value] = 0 } s := series{l: l}