From c064e4119e96c33807bf170949c1dc8cdf2ce9cb Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Wed, 15 Nov 2023 13:14:20 -0600 Subject: [PATCH 1/6] Add a query for the tool-input-size-to-memory-usage ratio. --- CHANGELOG.md | 1 + parts/22-query.sh | 98 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b601f429..09d6e8ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ - Added: - filter histogram: replaces bit.ly's data_hacks with a built-in AWK program to calculate a histogram. May not be entirely portable @hexylena. - mutate scale-table-autovacuum: Dynamically update autovacuum and autoanalyze scale for large tables. @natefoo + - query tool-input-to-memory-ratio: Calculate tool-input-to-memory-usage ratio - Fixed: - Replaced hardcoded metric_name with the variable in query_tool-metrics function @sanjaysrikakulam - improved man pages a tad diff --git a/parts/22-query.sh b/parts/22-query.sh index 69388158..ab341ca9 100644 --- a/parts/22-query.sh +++ b/parts/22-query.sh @@ -1299,6 +1299,104 @@ query_tool-metrics() { ##? [last=-1] [--like] [--ok] [--su EOF } +query_tool-input-to-memory-ratio() { ##? [last=-1] [--like] [--ok] [--min-used=0.5] [--summary]: Calculate tool-input-to-memory-usage ratio + meta <<-EOF + ADDED: 22 + AUTHORS: natefoo + EOF + handle_help "$@" <<-EOF + This can be useful for determining if and how input sizes affect memory consumption of a tool. 
+ + $ gxadmin query tool-input-to-memory-ratio %/genrich/% 5 --like + id | bytes_used | gb_used | input_bytes | input_gb | used_per_input + ----------+-------------+---------------------+-------------+--------------------+--------------------- + 53559706 | 10435952640 | 9.7192382812500000 | 558986135 | 0.5205964064225554 | 18.6694302176206929 + 53383034 | 10523262976 | 9.8005523681640625 | 3932595451 | 3.662514920346439 | 2.6759078341821537 + 53383035 | 10563473408 | 9.8380012512207031 | 4243466180 | 3.9520358480513096 | 2.4893502056849196 + 53383033 | 10579628032 | 9.8530464172363281 | 4294959575 | 3.9999928092584014 | 2.4632660324864641 + 53383032 | 14949408768 | 13.9227218627929688 | 4377419515 | 4.076789612881839 | 3.4151190482824902 + (5 rows) + + $ gxadmin query tool-input-to-memory-ratio %/genrich/% --like --ok --min-used=1 --summary + min | quant_1st | median | mean | quant_3rd | perc_95 | perc_99 | max | sum | stddev + ------+-----------+------------------+-----------+---------+---------+------------+------------+---------- + 0.15 | 2 | 4 | 3259.54 | 9 | 580 | 30263 | 1388868.84 | 3976650.06 | 49024.87 + (1 row) + + This query depends on your Galaxy server collecting cgroup metrics; specifically, the + 'memory.max_usage_in_bytes' metric. + + The optional 'last' argument can be used to limit the number of most recent jobs that will be checked. + + Use the --ok option to only include jobs that finished successfully. + + Use the --min-used option (value in GB) to exclude memory usage less than this amount. This data can be + misleading and uselessly skew summary statistics since smaller jobs will use a baseline amount of memory + regardless of input size. + + Use the --summary option to output summary statistics of the ratio instead of the values themselves. 
+ EOF + + read -r -d '' summary <<-EOF + j.id, + jmn.metric_value AS bytes_used, + jmn.metric_value/1024/1024/1024 AS gb_used, + isz.bytes AS input_bytes, isz.bytes::float/1024/1024/1024 AS input_gb, + (jmn.metric_value / isz.bytes) AS used_per_input + EOF + order_by='ORDER BY isz.bytes' + limit_clause= + + tool_subquery="j.tool_id = '$arg_tool_id'" + if [[ -n "$arg_like" ]]; then + tool_subquery="j.tool_id LIKE '$arg_tool_id'" + fi + if [[ -n "$arg_ok" ]]; then + tool_subquery="$tool_subquery AND j.state = 'ok'" + fi + if [[ "$arg_last" -gt 0 ]]; then + limit_clause="ORDER BY j.id DESC LIMIT $arg_last" + fi + if [[ -n "$arg_summary" ]]; then + summary="$(summary_statistics 'jmn.metric_value / isz.bytes' 0)" + order_by= + fi + + read -r -d '' QUERY <<-EOF + WITH input_sizes AS ( + SELECT + j.id AS job_id, + sum(d.total_size) AS bytes + FROM + job j + JOIN + job_to_input_dataset jtid on j.id = jtid.job_id + JOIN + history_dataset_association hda on jtid.dataset_id = hda.id + JOIN + dataset d on hda.dataset_id = d.id + WHERE + $tool_subquery + GROUP BY j.id + $limit_clause + ) + + SELECT + $summary + FROM + job j + JOIN + job_metric_numeric jmn ON j.id = jmn.job_id + JOIN + input_sizes isz ON j.id = isz.job_id + WHERE + jmn.metric_name = 'memory.max_usage_in_bytes' + AND isz.bytes > 0 + AND jmn.metric_value >= ${arg_min_used}*1024*1024*1024 + $order_by + EOF +} + query_tool-available-metrics() { ##? : list all available metrics for a given tool handle_help "$@" <<-EOF Gives a list of available metrics, which can then be used to query. 
From 5da7504d6300a96659e39737a98d57fa86552116 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Wed, 15 Nov 2023 13:54:25 -0600 Subject: [PATCH 2/6] Add some additional options to the tool-input-to-memory-ratio query --- parts/22-query.sh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/parts/22-query.sh b/parts/22-query.sh index ab341ca9..412fdb63 100644 --- a/parts/22-query.sh +++ b/parts/22-query.sh @@ -1299,7 +1299,7 @@ query_tool-metrics() { ##? [last=-1] [--like] [--ok] [--su EOF } -query_tool-input-to-memory-ratio() { ##? [last=-1] [--like] [--ok] [--min-used=0.5] [--summary]: Calculate tool-input-to-memory-usage ratio +query_tool-input-to-memory-ratio() { ##? [last=-1] [--like] [--ok] [--min-used=0.5] [--max-input=-1] [--input-name=none] [--summary]: Calculate tool-input-to-memory-usage ratio meta <<-EOF ADDED: 22 AUTHORS: natefoo @@ -1328,12 +1328,16 @@ query_tool-input-to-memory-ratio() { ##? [last=-1] [--like] [--ok] [-- The optional 'last' argument can be used to limit the number of most recent jobs that will be checked. - Use the --ok option to only include jobs that finished successfully. + The '--ok' option includes only jobs that finished successfully. - Use the --min-used option (value in GB) to exclude memory usage less than this amount. This data can be + The '--min-used' option (value in GB) excludes memory usage less than this amount. This data can be misleading and uselessly skew summary statistics since smaller jobs will use a baseline amount of memory regardless of input size. + The '--max-input' option (value in GB) excludes input sizes larger than this amount. + + The '--input-name' option can be used to limit input selection to a single tool input with the given name. + Use the --summary option to output summary statistics of the ratio instead of the values themselves. EOF @@ -1354,6 +1358,12 @@ query_tool-input-to-memory-ratio() { ##? 
[last=-1] [--like] [--ok] [-- if [[ -n "$arg_ok" ]]; then tool_subquery="$tool_subquery AND j.state = 'ok'" fi + if [[ "$arg_max_input" -gt 0 ]]; then + tool_subquery="$tool_subquery AND d.total_size <= ${arg_max_input}::float*1024*1024*1024" + fi + if [[ "$arg_input_name" != 'none' ]]; then + tool_subquery="$tool_subquery AND jtid.name = '$arg_input_name'" + fi if [[ "$arg_last" -gt 0 ]]; then limit_clause="ORDER BY j.id DESC LIMIT $arg_last" fi From 21a7e9823d75e26a6b1b2e3f53c094417b1bd5d4 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Wed, 15 Nov 2023 15:48:07 -0600 Subject: [PATCH 3/6] Add `--min-input` to input-to-mem query --- parts/22-query.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parts/22-query.sh b/parts/22-query.sh index 412fdb63..9de830b1 100644 --- a/parts/22-query.sh +++ b/parts/22-query.sh @@ -1299,7 +1299,7 @@ query_tool-metrics() { ##? [last=-1] [--like] [--ok] [--su EOF } -query_tool-input-to-memory-ratio() { ##? [last=-1] [--like] [--ok] [--min-used=0.5] [--max-input=-1] [--input-name=none] [--summary]: Calculate tool-input-to-memory-usage ratio +query_tool-input-to-memory-ratio() { ##? [last=-1] [--like] [--ok] [--min-used=0.5] [--min-input=-1] [--max-input=-1] [--input-name=none] [--summary]: Calculate tool-input-to-memory-usage ratio meta <<-EOF ADDED: 22 AUTHORS: natefoo @@ -1358,7 +1358,10 @@ query_tool-input-to-memory-ratio() { ##? 
[last=-1] [--like] [--ok] [-- if [[ -n "$arg_ok" ]]; then tool_subquery="$tool_subquery AND j.state = 'ok'" fi - if [[ "$arg_max_input" -gt 0 ]]; then + if [[ "$arg_min_input" != '-1' ]]; then + tool_subquery="$tool_subquery AND d.total_size >= ${arg_min_input}::float*1024*1024*1024" + fi + if [[ "$arg_max_input" != '-1' ]]; then tool_subquery="$tool_subquery AND d.total_size <= ${arg_max_input}::float*1024*1024*1024" fi if [[ "$arg_input_name" != 'none' ]]; then From d0f82df36c0854ae772341c24229b49e6648fd17 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Thu, 16 Nov 2023 14:42:45 -0500 Subject: [PATCH 4/6] Update CHANGELOG.md Co-authored-by: Helena --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09d6e8ec..b5fe5ce0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ - Added: - filter histogram: replaces bit.ly's data_hacks with a built-in AWK program to calculate a histogram. May not be entirely portable @hexylena. - mutate scale-table-autovacuum: Dynamically update autovacuum and autoanalyze scale for large tables. @natefoo - - query tool-input-to-memory-ratio: Calculate tool-input-to-memory-usage ratio + - query tool-input-to-memory-ratio: Calculate tool-input-to-memory-usage ratio @natefoo - Fixed: - Replaced hardcoded metric_name with the variable in query_tool-metrics function @sanjaysrikakulam - improved man pages a tad From bbb1d9e1dda71a36d55caeafd2b8c42674f16b3c Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Thu, 16 Nov 2023 13:43:13 -0600 Subject: [PATCH 5/6] Cast to float to prevent out of range --- parts/22-query.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/22-query.sh b/parts/22-query.sh index 9de830b1..a9fb4c2f 100644 --- a/parts/22-query.sh +++ b/parts/22-query.sh @@ -1405,7 +1405,7 @@ query_tool-input-to-memory-ratio() { ##? 
[last=-1] [--like] [--ok] [-- WHERE jmn.metric_name = 'memory.max_usage_in_bytes' AND isz.bytes > 0 - AND jmn.metric_value >= ${arg_min_used}*1024*1024*1024 + AND jmn.metric_value >= ${arg_min_used}::float*1024*1024*1024 $order_by EOF } From c84728e9b7ba1ecc5cb81f2d5689e5892135c795 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Thu, 12 Sep 2024 12:56:07 -0500 Subject: [PATCH 6/6] Include cgroupsv2 memory metric in query_tool-input-to-memory-ratio, also return size. --- parts/22-query.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parts/22-query.sh b/parts/22-query.sh index a9fb4c2f..6045bc1c 100644 --- a/parts/22-query.sh +++ b/parts/22-query.sh @@ -1403,7 +1403,7 @@ query_tool-input-to-memory-ratio() { ##? [last=-1] [--like] [--ok] [-- JOIN input_sizes isz ON j.id = isz.job_id WHERE - jmn.metric_name = 'memory.max_usage_in_bytes' + jmn.metric_name IN ('memory.max_usage_in_bytes', 'memory.peak') AND isz.bytes > 0 AND jmn.metric_value >= ${arg_min_used}::float*1024*1024*1024 $order_by @@ -2644,7 +2644,8 @@ query_job-inputs() { ##? : Input datasets to a specific job d.state AS d_state, d.deleted AS d_deleted, d.purged AS d_purged, - d.object_store_id AS object_store_id + d.object_store_id AS object_store_id, + d.total_size/1024/1024/1024 AS gb FROM job j JOIN job_to_input_dataset jtid ON j.id = jtid.job_id