-
Notifications
You must be signed in to change notification settings - Fork 357
/
20240325144732_trial-profiler-metrics-migration.tx.up.sql
66 lines (64 loc) · 2.72 KB
/
20240325144732_trial-profiler-metrics-migration.tx.up.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/*
Migrates historical profiling metrics (in `trial_profiler_metrics`) to `metrics`.
Queries `trial_profiler_metrics` to downsample and shim all system metrics data to fit
generic metrics `metrics` schema.
*/
insert into system_metrics (trial_id, end_time, metrics, trial_run_id, partition_type, metric_group)
-- Long-winded query to parse and massage existing data in `trial_profiler_metrics` to fit the new schema
select
pm.trial_id as trial_id,
-- Use latest time of aggregate GPU groups.
max(pm.ts) as end_time,
(
case pm.mgroup
when 'gpu' then jsonb_build_object(
pm.agent_id,
jsonb_object_agg(
-- GPU UUID should never be null here, but SQL doesn't know that.
coalesce(pm.gpu_uuid, ''), pm.metrics
)
)
else jsonb_object_agg(pm.agent_id, pm.metrics)
end
) as metrics,
coalesce(t.run_id, 1) as trial_run_id,
'PROFILING' as partition_type,
pm.mgroup as metric_group
from
(
-- Query to downsample existing metrics to one per metric name <> group <> agent <> trial <> gpu per second
select
jsonb_object_agg(
-- Change the `free_memory` metric name to `memory_free`.
case
when tpm.labels->>'name'='free_memory' then 'memory_free'
else tpm.labels->>'name'
end,
v
) as metrics,
date_trunc('second', t) as tsec,
-- Use most recent timestamp of this trial <> agent <> gpu group for the 'new' batch
max(t) as ts,
(
case
when tpm.labels->>'name' like 'gpu\_%' then 'gpu'
when tpm.labels->>'name' like 'net\_%' then 'network'
when tpm.labels->>'name' like 'disk\_%' then 'disk'
when tpm.labels->>'name' like 'cpu\_%' then 'cpu'
when tpm.labels->>'name'='free_memory' then 'memory'
end
) as mgroup,
(tpm.labels->>'trialId')::int as trial_id,
tpm.labels->>'agentId' as agent_id,
tpm.labels->>'gpuUuid' as gpu_uuid
from trial_profiler_metrics tpm
-- Only get every 10 reported (time, value) pairs (currently reports every 0.1s).
cross join lateral unnest(tpm.values, tpm.ts) with ordinality as vals(v, t, n)
where
tpm.labels->>'metricType' = 'PROFILER_METRIC_TYPE_SYSTEM'
and vals.n % 10 = 1
group by trial_id, mgroup, tsec, agent_id, gpu_uuid
) as pm
left join trials t on t.id=pm.trial_id
group by pm.trial_id, t.run_id, pm.mgroup, pm.tsec, pm.agent_id
;