From 21bde8c707f526a7d3c58eb088f25bca92f4bb88 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Thu, 8 Feb 2018 18:31:48 -0800 Subject: [PATCH] sort symbols in order of frequency rather than lexicographically Signed-off-by: Callum Styan --- block.go | 6 +++--- compact.go | 4 ++-- head.go | 22 +++++++++++----------- head_test.go | 12 ++++++------ index/index.go | 31 +++++++++++++++++++++---------- index/index_test.go | 20 ++++++++++---------- querier_test.go | 10 +++++----- 7 files changed, 58 insertions(+), 47 deletions(-) diff --git a/block.go b/block.go index 981c69eb..ce472d96 100644 --- a/block.go +++ b/block.go @@ -35,7 +35,7 @@ import ( type IndexWriter interface { // AddSymbols registers all string symbols that are encountered in series // and other indices. - AddSymbols(sym map[string]struct{}) error + AddSymbols(sym map[string]int) error // AddSeries populates the index writer with a series and its offsets // of chunks that the index can reference. @@ -62,7 +62,7 @@ type IndexWriter interface { type IndexReader interface { // Symbols returns a set of string symbols that may occur in series' labels // and indices. - Symbols() (map[string]struct{}, error) + Symbols() (map[string]int, error) // LabelValues returns the possible label values. LabelValues(names ...string) (index.StringTuples, error) @@ -375,7 +375,7 @@ type blockIndexReader struct { b *Block } -func (r blockIndexReader) Symbols() (map[string]struct{}, error) { +func (r blockIndexReader) Symbols() (map[string]int, error) { s, err := r.ir.Symbols() return s, errors.Wrapf(err, "block: %s", r.b.Meta().ULID) } diff --git a/compact.go b/compact.go index 3f5fa367..89d534e3 100644 --- a/compact.go +++ b/compact.go @@ -526,7 +526,7 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter) error { var ( set ChunkSeriesSet - allSymbols = make(map[string]struct{}, 1<<16) + allSymbols = make(map[string]int, 1<<16) closers = []io.Closer{} ) defer func() { closeAll(closers...) }() @@ -555,7 +555,7 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, return errors.Wrap(err, "read symbols") } for s := range symbols { - allSymbols[s] = struct{}{} + allSymbols[s] = symbols[s] } all, err := indexr.Postings(index.AllPostingsKey()) diff --git a/head.go b/head.go index 8d259fd6..1566ec18 100644 --- a/head.go +++ b/head.go @@ -67,7 +67,7 @@ type Head struct { series *stripeSeries symMtx sync.RWMutex - symbols map[string]struct{} + symbols map[string]int values map[string]stringset // label names to possible values postings *index.MemPostings // postings lists for terms @@ -223,7 +223,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, chunkRange int maxTime: math.MinInt64, series: newStripeSeries(), values: map[string]stringset{}, - symbols: map[string]struct{}{}, + symbols: make(map[string]int), postings: index.NewUnorderedMemPostings(), tombstones: NewMemTombstones(), } @@ -848,12 +848,12 @@ func (h *Head) gc() { h.postings.Delete(deleted) // Rebuild symbols and label value indices from what is left in the postings terms. - symbols := make(map[string]struct{}) + symbols := make(map[string]int) values := make(map[string]stringset, len(h.values)) if err := h.postings.Iter(func(t labels.Label, _ index.Postings) error { - symbols[t.Name] = struct{}{} - symbols[t.Value] = struct{}{} + symbols[t.Name]++ + symbols[t.Value]++ ss, ok := values[t.Name] if !ok { @@ -997,14 +997,14 @@ func (h *headIndexReader) Close() error { return nil } -func (h *headIndexReader) Symbols() (map[string]struct{}, error) { +func (h *headIndexReader) Symbols() (map[string]int, error) { h.head.symMtx.RLock() defer h.head.symMtx.RUnlock() - res := make(map[string]struct{}, len(h.head.symbols)) + res := make(map[string]int, len(h.head.symbols)) - for s := range h.head.symbols { - res[s] = struct{}{} + for s, num := range h.head.symbols { + res[s] = num } return res, nil } @@ -1136,8 +1136,8 @@ func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSerie } valset.set(l.Value) - h.symbols[l.Name] = struct{}{} - h.symbols[l.Value] = struct{}{} + h.symbols[l.Name]++ + h.symbols[l.Value]++ } return s, true diff --git a/head_test.go b/head_test.go index 0392912c..27889337 100644 --- a/head_test.go +++ b/head_test.go @@ -209,12 +209,12 @@ func TestHead_Truncate(t *testing.T) { testutil.Assert(t, postingsB2 == nil, "") testutil.Assert(t, postingsC1 == nil, "") - testutil.Equals(t, map[string]struct{}{ - "": {}, // from 'all' postings list - "a": {}, - "b": {}, - "1": {}, - "2": {}, + testutil.Equals(t, map[string]int{ + "": 2, // from 'all' postings list + "a": 2, + "b": 1, + "1": 2, + "2": 1, }, h.symbols) testutil.Equals(t, map[string]stringset{ diff --git a/index/index.go b/index/index.go index 17acf9ab..0283b84a 100644 --- a/index/index.go +++ b/index/index.go @@ -55,6 +55,17 @@ func (s indexWriterSeriesSlice) Less(i, j int) bool { return labels.Compare(s[i].labels, s[j].labels) < 0 } +type symbolFrequencyPair struct { + symbol string + frequency int +} + +type symbolFrequencylist []symbolFrequencyPair + +func (s symbolFrequencylist) Len() int { return len(s) } +func (s symbolFrequencylist) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s symbolFrequencylist) Less(i, j int) bool { return s[i].frequency < s[j].frequency } + type indexWriterStage uint8 const ( @@ -332,17 +343,17 @@ func (w *Writer) AddSeries(ref uint64, lset labels.Labels, chunks ...chunks.Meta return nil } -func (w *Writer) AddSymbols(sym map[string]struct{}) error { +func (w *Writer) AddSymbols(sym map[string]int) error { if err := w.ensureStage(idxStageSymbols); err != nil { return err } // Generate sorted list of strings we will store as reference table. - symbols := make([]string, 0, len(sym)) + symbols := make(symbolFrequencylist, 0, len(sym)) - for s := range sym { - symbols = append(symbols, s) + for k, v := range sym { + symbols = append(symbols, symbolFrequencyPair{k, v}) } - sort.Strings(symbols) + sort.Sort(sort.Reverse(symbols)) const headerSize = 4 @@ -354,8 +365,8 @@ func (w *Writer) AddSymbols(sym map[string]struct{}) error { w.symbols = make(map[string]uint32, len(symbols)) for index, s := range symbols { - w.symbols[s] = uint32(index) - w.buf2.putUvarintStr(s) + w.symbols[s.symbol] = uint32(index) + w.buf2.putUvarintStr(s.symbol) } w.buf1.putBE32int(w.buf2.len()) @@ -834,11 +845,11 @@ func (r *Reader) lookupSymbol(o uint32) (string, error) { } // Symbols returns a set of symbols that exist within the index. -func (r *Reader) Symbols() (map[string]struct{}, error) { - res := make(map[string]struct{}, len(r.symbols)) +func (r *Reader) Symbols() (map[string]int, error) { + res := make(map[string]int, len(r.symbols)) for _, s := range r.symbols { - res[s] = struct{}{} + res[s] = 0 } return res, nil } diff --git a/index/index_test.go b/index/index_test.go index 8d719813..39aacb9e 100644 --- a/index/index_test.go +++ b/index/index_test.go @@ -191,13 +191,13 @@ func TestIndexRW_Postings(t *testing.T) { labels.FromStrings("a", "1", "b", "4"), } - err = iw.AddSymbols(map[string]struct{}{ - "a": {}, - "b": {}, - "1": {}, - "2": {}, - "3": {}, - "4": {}, + err = iw.AddSymbols(map[string]int{ + "a": 1, + "b": 2, + "1": 1, + "2": 4, + "3": 5, + "4": 3, }) testutil.Ok(t, err) @@ -245,11 +245,11 @@ func TestPersistence_index_e2e(t *testing.T) { // Sort labels as the index writer expects series in sorted order. sort.Sort(labels.Slice(lbls)) - symbols := map[string]struct{}{} + symbols := make(map[string]int) for _, lset := range lbls { for _, l := range lset { - symbols[l.Name] = struct{}{} - symbols[l.Value] = struct{}{} + symbols[l.Name] = 0 + symbols[l.Value] = 0 } } diff --git a/querier_test.go b/querier_test.go index 87a45b64..13f0b7e3 100644 --- a/querier_test.go +++ b/querier_test.go @@ -1390,7 +1390,7 @@ type mockIndex struct { series map[uint64]series labelIndex map[string][]string postings map[labels.Label][]uint64 - symbols map[string]struct{} + symbols map[string]int } func newMockIndex() mockIndex { @@ -1398,12 +1398,12 @@ func newMockIndex() mockIndex { series: make(map[uint64]series), labelIndex: make(map[string][]string), postings: make(map[labels.Label][]uint64), - symbols: make(map[string]struct{}), + symbols: make(map[string]int), } return ix } -func (m mockIndex) Symbols() (map[string]struct{}, error) { +func (m mockIndex) Symbols() (map[string]int, error) { return m.symbols, nil } @@ -1412,8 +1412,8 @@ func (m mockIndex) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) return errors.Errorf("series with reference %d already added", ref) } for _, lbl := range l { - m.symbols[lbl.Name] = struct{}{} - m.symbols[lbl.Value] = struct{}{} + m.symbols[lbl.Name] = 0 + m.symbols[lbl.Value] = 0 } s := series{l: l}