Save memory on histograms: slightly faster with less code #536

Merged · 3 commits · Feb 11, 2019
120 changes: 44 additions & 76 deletions prometheus/histogram.go
@@ -224,34 +224,35 @@ type histogramCounts struct {
}

type histogram struct {
// countAndHotIdx is a complicated one. For lock-free yet atomic
// observations, we need to save the total count of observations again,
// combined with the index of the currently-hot counts struct, so that
// we can perform the operation on both values atomically. The least
// significant bit defines the hot counts struct. The remaining 63 bits
// represent the total count of observations. This happens under the
// assumption that the 63bit count will never overflow. Rationale: An
// observation takes about 30ns. Let's assume it could happen in
// 10ns. Overflowing the counter will then take at least (2^63)*10ns,
// which is about 3000 years.
// countAndHotIdx enables lock-free writes with use of atomic updates.
// The most significant bit is the hot index [0 or 1] of the counts field
// below. Writes update the hot one. All remaining bits count the number
// of writes initiated. Write transactions start by incrementing this
// counter, and finish by incrementing the count field in the respective
// histogramCounts, as a marker for completion.
//
// This has to be first in the struct for 64bit alignment. See
// Reads swap hot and cold while holding the switchMtx lock. A cooldown
// is awaited (still under the lock) by comparing the number of completed
// writes with the initiation count. Once they match, the last write
// transaction on the now-cold counts has completed. All cold fields must
// be merged into the new hot ones before switchMtx is unlocked.
//
// Fields with atomic access first! See alignment constraint:
// http://golang.org/pkg/sync/atomic/#pkg-note-BUG
countAndHotIdx uint64

// Counts has to be an array of pointers to guarantee 64bit alignment of
// the histogramCounts, see
// http://golang.org/pkg/sync/atomic/#pkg-note-BUG.
counts [2]*histogramCounts

switchMtx sync.Mutex

selfCollector
desc *Desc
writeMtx sync.Mutex // Only used in the Write method.
desc *Desc

upperBounds []float64

// Two counts, one is "hot" for lock-free observations, the other is
// "cold" for writing out a dto.Metric. It has to be an array of
// pointers to guarantee 64bit alignment of the histogramCounts, see
// http://golang.org/pkg/sync/atomic/#pkg-note-BUG.
counts [2]*histogramCounts
hotIdx int // Index of currently-hot counts. Only used within Write.

labelPairs []*dto.LabelPair
}
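
As an aside for readers of the diff: the packing scheme the new comment describes can be tried in isolation. The following standalone sketch (illustrative only, not part of the PR; the variable names are made up) shows the hot index living in the most significant bit and the write count in the low 63 bits:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var countAndHotIdx uint64 // MSB: hot index, low 63 bits: writes initiated

	// A writer initiates a write: bump the low 63 bits and read the hot
	// index from the most significant bit of the returned value.
	n := atomic.AddUint64(&countAndHotIdx, 1)
	fmt.Println(n>>63, n&((1<<63)-1)) // 0 1: hot index 0, one write initiated

	// A reader swaps hot and cold: adding 1<<63 flips the MSB (wrapping on
	// overflow) without disturbing the count in the low bits.
	n = atomic.AddUint64(&countAndHotIdx, 1<<63)
	fmt.Println(n>>63, n&((1<<63)-1)) // 1 1: hot index flipped, count intact
}
```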

@@ -271,11 +272,11 @@ func (h *histogram) Observe(v float64) {
// 300 buckets: 154 ns/op linear - binary 61.6 ns/op
i := sort.SearchFloat64s(h.upperBounds, v)

// We increment h.countAndHotIdx by 2 so that the counter in the upper
// 63 bits gets incremented by 1. At the same time, we get the new value
// We increment h.countAndHotIdx so that the counter in the lower
// 63 bits gets incremented. At the same time, we get the new value
// back, which we can use to find the currently-hot counts.
n := atomic.AddUint64(&h.countAndHotIdx, 2)
hotCounts := h.counts[n%2]
n := atomic.AddUint64(&h.countAndHotIdx, 1)
hotCounts := h.counts[n>>63]

if i < len(h.upperBounds) {
atomic.AddUint64(&hotCounts.buckets[i], 1)
@@ -293,72 +294,39 @@ func (h *histogram) Observe(v float64) {
}
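
For context on the bucket lookup above: sort.SearchFloat64s returns the smallest index i with upperBounds[i] >= v, i.e. the first `le` bucket that can hold the observation, or len(upperBounds) when v belongs in the implicit +Inf bucket. A small standalone illustration with invented bounds:

```go
package main

import (
	"fmt"
	"sort"
)

func main() {
	upperBounds := []float64{0.1, 0.5, 1, 5} // example bounds, sorted ascending

	for _, v := range []float64{0.05, 0.5, 3, 10} {
		i := sort.SearchFloat64s(upperBounds, v)
		if i < len(upperBounds) {
			fmt.Printf("v=%g -> bucket le=%g (index %d)\n", v, upperBounds[i], i)
		} else {
			fmt.Printf("v=%g -> implicit +Inf bucket\n", v)
		}
	}
}
```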

func (h *histogram) Write(out *dto.Metric) error {
var (
his = &dto.Histogram{}
buckets = make([]*dto.Bucket, len(h.upperBounds))
hotCounts, coldCounts *histogramCounts
count uint64
)

// For simplicity, we mutex the rest of this method. It is not in the
// hot path, i.e. Observe is called much more often than Write. The
// complication of making Write lock-free isn't worth it.
h.writeMtx.Lock()
defer h.writeMtx.Unlock()

// This is a bit arcane, which is why the following spells out this if
// clause in English:
//
// If the currently-hot counts struct is #0, we atomically increment
// h.countAndHotIdx by 1 so that from now on Observe will use the counts
// struct #1. Furthermore, the atomic increment gives us the new value,
// which, in its most significant 63 bits, tells us the count of
// observations done so far up to and including currently ongoing
// observations still using the counts struct just changed from hot to
// cold. To have a normal uint64 for the count, we bitshift by 1 and
// save the result in count. We also set h.hotIdx to 1 for the next
// Write call, and we will refer to counts #1 as hotCounts and to counts
// #0 as coldCounts.
//
// If the currently-hot counts struct is #1, we do the corresponding
// things the other way round. We have to _decrement_ h.countAndHotIdx
// (which is a bit arcane in itself, as we have to express -1 with an
// unsigned int...).
if h.hotIdx == 0 {
count = atomic.AddUint64(&h.countAndHotIdx, 1) >> 1
h.hotIdx = 1
hotCounts = h.counts[1]
coldCounts = h.counts[0]
} else {
count = atomic.AddUint64(&h.countAndHotIdx, ^uint64(0)) >> 1 // Decrement.
h.hotIdx = 0
hotCounts = h.counts[0]
coldCounts = h.counts[1]
}

// Now we have to wait for the now-declared-cold counts to actually cool
// down, i.e. wait for all observations still using it to finish. That's
// the case once the count in the cold counts struct is the same as the
// one atomically retrieved from the upper 63bits of h.countAndHotIdx.
for {
if count == atomic.LoadUint64(&coldCounts.count) {
break
}
h.switchMtx.Lock()
defer h.switchMtx.Unlock()

// Adding 1<<63 switches the hot index (from 0 to 1 or from 1 to 0)
// without touching the count bits. See the struct comments for a full
// description of the algorithm.
n := atomic.AddUint64(&h.countAndHotIdx, 1<<63)
count := n & ((1 << 63) - 1)
hotCounts := h.counts[n>>63]
coldCounts := h.counts[(^n)>>63]

// await cooldown
for count != atomic.LoadUint64(&coldCounts.count) {
runtime.Gosched() // Let observations get work done.
}

his.SampleCount = proto.Uint64(count)
his.SampleSum = proto.Float64(math.Float64frombits(atomic.LoadUint64(&coldCounts.sumBits)))
his := &dto.Histogram{
Bucket: make([]*dto.Bucket, len(h.upperBounds)),
SampleCount: proto.Uint64(count),
SampleSum: proto.Float64(math.Float64frombits(atomic.LoadUint64(&coldCounts.sumBits))),
}
var cumCount uint64
for i, upperBound := range h.upperBounds {
cumCount += atomic.LoadUint64(&coldCounts.buckets[i])
buckets[i] = &dto.Bucket{
his.Bucket[i] = &dto.Bucket{
CumulativeCount: proto.Uint64(cumCount),
UpperBound: proto.Float64(upperBound),
}
}

his.Bucket = buckets
out.Histogram = his
out.Label = h.labelPairs

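
Putting the pieces together, the swap-and-cooldown protocol used by Write can be exercised in a toy program. This is a sketch under the assumptions spelled out in the struct comment (MSB = hot index, low 63 bits = initiated writes); the type, the names, and the 100-goroutine setup are invented for illustration, not the library's code:

```go
package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

type side struct{ count uint64 } // per-side completion marker

func main() {
	var (
		countAndHotIdx uint64
		sides          = [2]*side{{}, {}}
		wg             sync.WaitGroup
	)

	// Writers: initiate by bumping the low 63 bits, then mark completion
	// on whichever side was hot at initiation time.
	for i := 0; i < 100; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			n := atomic.AddUint64(&countAndHotIdx, 1)
			atomic.AddUint64(&sides[n>>63].count, 1)
		}()
	}

	// Reader: flip the hot index, then wait until every write initiated
	// before the flip has completed on the now-cold side.
	n := atomic.AddUint64(&countAndHotIdx, 1<<63)
	initiated := n & ((1 << 63) - 1)
	cold := sides[(^n)>>63]
	for initiated != atomic.LoadUint64(&cold.count) {
		runtime.Gosched() // let in-flight writers finish
	}
	fmt.Printf("cold side cooled down after %d writes\n", initiated)
	wg.Wait()
}
```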