-
Notifications
You must be signed in to change notification settings - Fork 25k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add additional BlobCacheMetrics, expose BlobCacheMetrics via SharedBlobCacheService #111730
Changes from 20 commits
5dacb69
c4b6487
ad68e99
181b958
07dfc5a
9c8ee42
752b1ef
c0189f4
7c34720
69d58ec
c823f75
10a32f6
439152c
f8dfabc
5c38dbf
d504094
c8e6d28
03a8f1d
47a0a35
a71d70b
95d2e5f
e18a283
168efe1
f60d341
c938e89
8f5967a
45fb734
71c0b2c
91405f2
03a0140
df97c59
8295d35
a903534
e8edc63
81e2776
b17f316
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 111730 | ||
summary: "Add callback for copy-to-cache metrics, additional `BlobCacheMetrics`" | ||
area: Store | ||
type: enhancement | ||
issues: [] |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -7,15 +7,37 @@ | |||
|
||||
package org.elasticsearch.blobcache; | ||||
|
||||
import org.elasticsearch.index.shard.ShardId; | ||||
import org.elasticsearch.telemetry.TelemetryProvider; | ||||
import org.elasticsearch.telemetry.metric.DoubleHistogram; | ||||
import org.elasticsearch.telemetry.metric.LongCounter; | ||||
import org.elasticsearch.telemetry.metric.LongHistogram; | ||||
import org.elasticsearch.telemetry.metric.MeterRegistry; | ||||
|
||||
import java.util.Map; | ||||
import java.util.concurrent.TimeUnit; | ||||
|
||||
public class BlobCacheMetrics { | ||||
private static final String CACHE_POPULATION_REASON_ATTRIBUTE_KEY = "cachePopulationReason"; | ||||
private static final String SHARD_ID_ATTRIBUTE_KEY = "shardId"; | ||||
|
||||
private final LongCounter cacheMissCounter; | ||||
private final LongCounter evictedCountNonZeroFrequency; | ||||
private final LongHistogram cacheMissLoadTimes; | ||||
private final DoubleHistogram cachePopulateThroughput; | ||||
private final LongCounter cachePopulationBytes; | ||||
private final LongCounter cachePopulationTime; | ||||
|
||||
public enum CachePopulationReason { | ||||
/** | ||||
* When warming the cache | ||||
*/ | ||||
Warming, | ||||
/** | ||||
* When the data we need is not in the cache | ||||
*/ | ||||
CacheMiss | ||||
} | ||||
|
||||
public BlobCacheMetrics(MeterRegistry meterRegistry) { | ||||
this( | ||||
|
@@ -33,14 +55,39 @@ public BlobCacheMetrics(MeterRegistry meterRegistry) { | |||
"es.blob_cache.cache_miss_load_times.histogram", | ||||
"The time in milliseconds for populating entries in the blob store resulting from a cache miss, expressed as a histogram.", | ||||
"ms" | ||||
), | ||||
meterRegistry.registerDoubleHistogram( | ||||
"es.blob_cache.populate_throughput.histogram", | ||||
"The throughput when populating the blob store from the cache", | ||||
nicktindall marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
"MB/second" | ||||
), | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use MiB/s to work around |
||||
meterRegistry.registerLongCounter( | ||||
"es.blob_cache.populate_bytes.total", | ||||
"The number of bytes that have been loaded into the cache", | ||||
"bytes" | ||||
), | ||||
meterRegistry.registerLongCounter( | ||||
"es.blob_cache.populate_time.total", | ||||
"The time spent copying data into the cache", | ||||
"milliseconds" | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe I missed the discussion somewhere: I thought this should be a histogram similar to the S3 HTTP request time? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We did discuss this; the feeling was that because we've got the throughput distribution, it might give us more flexibility to record population bytes and time as raw totals. Leaving them as raw totals leaves more options for aggregation in the charts (e.g. how much did we download when warming shard X, how long did we spend warming index Y, how much did we download due to warming when that node failed). I don't think you can answer those questions with bytes/time histograms; (I think) they can only tell us the distribution of chunk sizes or chunk download times in some window. |
||||
) | ||||
); | ||||
} | ||||
|
||||
BlobCacheMetrics(LongCounter cacheMissCounter, LongCounter evictedCountNonZeroFrequency, LongHistogram cacheMissLoadTimes) { | ||||
BlobCacheMetrics( | ||||
LongCounter cacheMissCounter, | ||||
LongCounter evictedCountNonZeroFrequency, | ||||
LongHistogram cacheMissLoadTimes, | ||||
DoubleHistogram cachePopulateThroughput, | ||||
LongCounter cachePopulationBytes, | ||||
LongCounter cachePopulationTime | ||||
) { | ||||
this.cacheMissCounter = cacheMissCounter; | ||||
this.evictedCountNonZeroFrequency = evictedCountNonZeroFrequency; | ||||
this.cacheMissLoadTimes = cacheMissLoadTimes; | ||||
this.cachePopulateThroughput = cachePopulateThroughput; | ||||
this.cachePopulationBytes = cachePopulationBytes; | ||||
this.cachePopulationTime = cachePopulationTime; | ||||
nicktindall marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
} | ||||
|
||||
public static BlobCacheMetrics NOOP = new BlobCacheMetrics(TelemetryProvider.NOOP.getMeterRegistry()); | ||||
|
@@ -56,4 +103,47 @@ public LongCounter getEvictedCountNonZeroFrequency() { | |||
public LongHistogram getCacheMissLoadTimes() { | ||||
return cacheMissLoadTimes; | ||||
} | ||||
|
||||
/** | ||||
* Record the various cache population metrics after a chunk is copied to the cache | ||||
* | ||||
* @param totalBytesCopied The total number of bytes copied | ||||
* @param totalCopyTimeNanos The time taken to copy the bytes in nanoseconds | ||||
* @param shardId The shard ID to which the chunk belonged | ||||
* @param cachePopulationReason The reason for the cache being populated | ||||
*/ | ||||
public void recordCachePopulationMetrics( | ||||
int totalBytesCopied, | ||||
long totalCopyTimeNanos, | ||||
nicktindall marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
ShardId shardId, | ||||
CachePopulationReason cachePopulationReason | ||||
) { | ||||
Map<String, Object> metricAttributes = Map.of( | ||||
SHARD_ID_ATTRIBUTE_KEY, | ||||
shardId, | ||||
CACHE_POPULATION_REASON_ATTRIBUTE_KEY, | ||||
cachePopulationReason | ||||
); | ||||
ywangd marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
assert totalBytesCopied > 0 : "We shouldn't be recording zero-sized copies"; | ||||
cachePopulationBytes.incrementBy(totalBytesCopied, metricAttributes); | ||||
|
||||
// This is almost certainly paranoid, but if we had a very fast/small copy with a very coarse nanosecond timer it might happen? | ||||
if (totalCopyTimeNanos > 0) { | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we could add a warning log in the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressed in b17f316. I couldn't find the warning you were referring to, but I did add one. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. elasticsearch/modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobStore.java Line 229 in 5934190
|
||||
cachePopulateThroughput.record(toMegabytesPerSecond(totalBytesCopied, totalCopyTimeNanos), metricAttributes); | ||||
cachePopulationTime.incrementBy(TimeUnit.NANOSECONDS.toMillis(totalCopyTimeNanos), metricAttributes); | ||||
} | ||||
} | ||||
|
||||
/** | ||||
* Calculate throughput as megabytes/second | ||||
* | ||||
* @param totalBytes The total number of bytes transferred | ||||
* @param totalNanoseconds The time to transfer in nanoseconds | ||||
* @return The throughput as megabytes/second | ||||
*/ | ||||
private double toMegabytesPerSecond(int totalBytes, long totalNanoseconds) { | ||||
double totalSeconds = totalNanoseconds / 1_000_000_000.0; | ||||
double totalMegabytes = totalBytes / 1_000_000.0; | ||||
return totalMegabytes / totalSeconds; | ||||
} | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I used Mebibytes because that's what |
||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0; you may not use this file except in compliance with the Elastic License | ||
* 2.0. | ||
*/ | ||
|
||
package org.elasticsearch.blobcache.shared; | ||
|
||
public interface BlobCachePopulationListener { | ||
|
||
BlobCachePopulationListener NOOP = (bc, ctn) -> {}; | ||
|
||
/** | ||
* Notify of a blob cache population that occurred | ||
* | ||
* Note that <code>bytesCopied</code> is approximate, there are cases where we write | ||
* more than we read, due to page alignment, and there are times when we read more | ||
* than we write, e.g. when filling multiple gaps. Notifiers should try and record | ||
* the larger of those two numbers when invoking this method. | ||
* | ||
* @param bytesCopied The number of bytes copied into the cache | ||
* @param copyTimeNanos The time in nanoseconds taken to copy those bytes | ||
*/ | ||
void onCachePopulation(int bytesCopied, long copyTimeNanos); | ||
} |
This comment was marked as outdated.
Sorry, something went wrong.