From c064e4119e96c33807bf170949c1dc8cdf2ce9cb Mon Sep 17 00:00:00 2001
From: Nate Coraor <nate@bx.psu.edu>
Date: Wed, 15 Nov 2023 13:14:20 -0600
Subject: [PATCH 1/6] Add a query for the tool-input-size-to-memory-usage
 ratio.

---
 CHANGELOG.md      |  1 +
 parts/22-query.sh | 98 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b601f429..09d6e8ec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 - Added:
 	- filter histogram: replaces bit.ly's data_hacks with a built-in AWK program to calculate a histogram. May not be entirely portable @hexylena.
 	- mutate scale-table-autovacuum: Dynamically update autovacuum and autoanalyze scale for large tables. @natefoo
+	- query tool-input-to-memory-ratio: Calculate tool-input-to-memory-usage ratio
 - Fixed:
 	- Replaced hardcoded metric_name with the variable in query_tool-metrics function @sanjaysrikakulam
 	- improved man pages a tad
diff --git a/parts/22-query.sh b/parts/22-query.sh
index 69388158..ab341ca9 100644
--- a/parts/22-query.sh
+++ b/parts/22-query.sh
@@ -1299,6 +1299,104 @@ query_tool-metrics() { ##? <tool_id> <metric_id> [last=-1] [--like] [--ok] [--su
 	EOF
 }
 
+query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--min-used=0.5] [--summary]: Calculate tool-input-to-memory-usage ratio
+	meta <<-EOF
+		ADDED: 22
+		AUTHORS: natefoo
+	EOF
+	handle_help "$@" <<-EOF
+		This can be useful for determining if and how input sizes affect memory consumption of a tool.
+
+		$ gxadmin query tool-input-to-memory-ratio %/genrich/% 5 --like
+		    id    |  bytes_used |       gb_used       | input_bytes |      input_gb      |   used_per_input
+		----------+-------------+---------------------+-------------+--------------------+---------------------
+		 53559706 | 10435952640 |  9.7192382812500000 |   558986135 | 0.5205964064225554 | 18.6694302176206929
+		 53383034 | 10523262976 |  9.8005523681640625 |  3932595451 |  3.662514920346439 |  2.6759078341821537
+		 53383035 | 10563473408 |  9.8380012512207031 |  4243466180 | 3.9520358480513096 |  2.4893502056849196
+		 53383033 | 10579628032 |  9.8530464172363281 |  4294959575 | 3.9999928092584014 |  2.4632660324864641
+		 53383032 | 14949408768 | 13.9227218627929688 |  4377419515 |  4.076789612881839 |  3.4151190482824902
+		(5 rows)
+
+		$ gxadmin query tool-input-to-memory-ratio %/genrich/% --like --ok --min-used=1 --summary
+		  min | quant_1st | median |   mean  | quant_3rd | perc_95 | perc_99 |    max     |    sum     |  stddev
+		------+-----------+--------+---------+-----------+---------+---------+------------+------------+----------
+		 0.15 |         2 |      4 | 3259.54 |         9 |     580 |   30263 | 1388868.84 | 3976650.06 | 49024.87
+		(1 row)
+
+		This query depends on your Galaxy server collecting cgroup metrics; specifically, the
+		'memory.max_usage_in_bytes' metric.
+
+		The optional 'last' argument can be used to limit the number of most recent jobs that will be checked.
+
+		Use the --ok option to only include jobs that finished successfully.
+
+		Use the --min-used option (value in GB) to exclude memory usage less than this amount. This data can be
+		misleading and uselessly skew summary statistics since smaller jobs will use a baseline amount of memory
+		regardless of input size.
+
+		Use the --summary option to output summary statistics of the ratio instead of the values themselves.
+	EOF
+
+	read -r -d '' summary <<-EOF
+		j.id,
+		jmn.metric_value AS bytes_used,
+		jmn.metric_value/1024/1024/1024 AS gb_used,
+		isz.bytes AS input_bytes, isz.bytes::float/1024/1024/1024 AS input_gb,
+		(jmn.metric_value / isz.bytes) AS used_per_input
+	EOF
+	order_by='ORDER BY isz.bytes'
+	limit_clause=
+
+	tool_subquery="j.tool_id = '$arg_tool_id'"
+	if [[ -n "$arg_like" ]]; then
+		tool_subquery="j.tool_id LIKE '$arg_tool_id'"
+	fi
+	if [[ -n "$arg_ok" ]]; then
+		tool_subquery="$tool_subquery AND j.state = 'ok'"
+	fi
+	if [[ "$arg_last" -gt 0 ]]; then
+		limit_clause="ORDER BY j.id DESC LIMIT $arg_last"
+	fi
+	if [[ -n "$arg_summary" ]]; then
+		summary="$(summary_statistics 'jmn.metric_value / isz.bytes' 0)"
+		order_by=
+	fi
+
+	read -r -d '' QUERY <<-EOF
+		WITH input_sizes AS (
+			SELECT
+				j.id AS job_id,
+				sum(d.total_size) AS bytes
+			FROM
+				job j
+			JOIN
+				job_to_input_dataset jtid on j.id = jtid.job_id
+			JOIN
+				history_dataset_association hda on jtid.dataset_id = hda.id
+			JOIN
+				dataset d on hda.dataset_id = d.id
+			WHERE
+				$tool_subquery
+			GROUP BY j.id
+			$limit_clause
+		)
+
+		SELECT
+			$summary
+		FROM
+			job j
+		JOIN
+			job_metric_numeric jmn ON j.id = jmn.job_id
+		JOIN
+			input_sizes isz ON j.id = isz.job_id
+		WHERE
+			jmn.metric_name = 'memory.max_usage_in_bytes'
+			AND isz.bytes > 0
+			AND jmn.metric_value >= ${arg_min_used}*1024*1024*1024
+		$order_by
+	EOF
+}
+
 query_tool-available-metrics() { ##? <tool_id>: list all available metrics for a given tool
 	handle_help "$@" <<-EOF
 		Gives a list of available metrics, which can then be used to query.

From 5da7504d6300a96659e39737a98d57fa86552116 Mon Sep 17 00:00:00 2001
From: Nate Coraor <nate@bx.psu.edu>
Date: Wed, 15 Nov 2023 13:54:25 -0600
Subject: [PATCH 2/6] Add some additional options to the
 tool-input-to-memory-ratio query

---
 parts/22-query.sh | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/parts/22-query.sh b/parts/22-query.sh
index ab341ca9..412fdb63 100644
--- a/parts/22-query.sh
+++ b/parts/22-query.sh
@@ -1299,7 +1299,7 @@ query_tool-metrics() { ##? <tool_id> <metric_id> [last=-1] [--like] [--ok] [--su
 	EOF
 }
 
-query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--min-used=0.5] [--summary]: Calculate tool-input-to-memory-usage ratio
+query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--min-used=0.5] [--max-input=-1] [--input-name=none] [--summary]: Calculate tool-input-to-memory-usage ratio
 	meta <<-EOF
 		ADDED: 22
 		AUTHORS: natefoo
@@ -1328,12 +1328,16 @@ query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--
 
 		The optional 'last' argument can be used to limit the number of most recent jobs that will be checked.
 
-		Use the --ok option to only include jobs that finished successfully.
+		The '--ok' option includes only jobs that finished successfully.
 
-		Use the --min-used option (value in GB) to exclude memory usage less than this amount. This data can be
+		The '--min-used' option (value in GB) excludes memory usage less than this amount. This data can be
 		misleading and uselessly skew summary statistics since smaller jobs will use a baseline amount of memory
 		regardless of input size.
 
+		The '--max-input' option (value in GB) excludes input sizes larger than this amount.
+
+		The '--input-name' option can be used to limit input selection to a single tool input with the given name.
+
 		Use the --summary option to output summary statistics of the ratio instead of the values themselves.
 	EOF
 
@@ -1354,6 +1358,12 @@ query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--
 	if [[ -n "$arg_ok" ]]; then
 		tool_subquery="$tool_subquery AND j.state = 'ok'"
 	fi
+	if [[ "$arg_max_input" -gt 0 ]]; then
+		tool_subquery="$tool_subquery AND d.total_size <= ${arg_max_input}::float*1024*1024*1024"
+	fi
+	if [[ "$arg_input_name" != 'none' ]]; then
+		tool_subquery="$tool_subquery AND jtid.name = '$arg_input_name'"
+	fi
 	if [[ "$arg_last" -gt 0 ]]; then
 		limit_clause="ORDER BY j.id DESC LIMIT $arg_last"
 	fi

From 21a7e9823d75e26a6b1b2e3f53c094417b1bd5d4 Mon Sep 17 00:00:00 2001
From: Nate Coraor <nate@bx.psu.edu>
Date: Wed, 15 Nov 2023 15:48:07 -0600
Subject: [PATCH 3/6] Add `--min-input` to input-to-mem query

---
 parts/22-query.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/parts/22-query.sh b/parts/22-query.sh
index 412fdb63..9de830b1 100644
--- a/parts/22-query.sh
+++ b/parts/22-query.sh
@@ -1299,7 +1299,7 @@ query_tool-metrics() { ##? <tool_id> <metric_id> [last=-1] [--like] [--ok] [--su
 	EOF
 }
 
-query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--min-used=0.5] [--max-input=-1] [--input-name=none] [--summary]: Calculate tool-input-to-memory-usage ratio
+query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--min-used=0.5] [--min-input=-1] [--max-input=-1] [--input-name=none] [--summary]: Calculate tool-input-to-memory-usage ratio
 	meta <<-EOF
 		ADDED: 22
 		AUTHORS: natefoo
@@ -1358,7 +1358,10 @@ query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--
 	if [[ -n "$arg_ok" ]]; then
 		tool_subquery="$tool_subquery AND j.state = 'ok'"
 	fi
-	if [[ "$arg_max_input" -gt 0 ]]; then
+	if [[ "$arg_min_input" != '-1' ]]; then
+		tool_subquery="$tool_subquery AND d.total_size >= ${arg_min_input}::float*1024*1024*1024"
+	fi
+	if [[ "$arg_max_input" != '-1' ]]; then
 		tool_subquery="$tool_subquery AND d.total_size <= ${arg_max_input}::float*1024*1024*1024"
 	fi
 	if [[ "$arg_input_name" != 'none' ]]; then

From d0f82df36c0854ae772341c24229b49e6648fd17 Mon Sep 17 00:00:00 2001
From: Nate Coraor <nate@bx.psu.edu>
Date: Thu, 16 Nov 2023 14:42:45 -0500
Subject: [PATCH 4/6] Update CHANGELOG.md

Co-authored-by: Helena <hexylena@galaxians.org>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09d6e8ec..b5fe5ce0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 - Added:
 	- filter histogram: replaces bit.ly's data_hacks with a built-in AWK program to calculate a histogram. May not be entirely portable @hexylena.
 	- mutate scale-table-autovacuum: Dynamically update autovacuum and autoanalyze scale for large tables. @natefoo
-	- query tool-input-to-memory-ratio: Calculate tool-input-to-memory-usage ratio
+	- query tool-input-to-memory-ratio: Calculate tool-input-to-memory-usage ratio @natefoo
 - Fixed:
 	- Replaced hardcoded metric_name with the variable in query_tool-metrics function @sanjaysrikakulam
 	- improved man pages a tad

From bbb1d9e1dda71a36d55caeafd2b8c42674f16b3c Mon Sep 17 00:00:00 2001
From: Nate Coraor <nate@bx.psu.edu>
Date: Thu, 16 Nov 2023 13:43:13 -0600
Subject: [PATCH 5/6] Cast to float to prevent out of range

---
 parts/22-query.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parts/22-query.sh b/parts/22-query.sh
index 9de830b1..a9fb4c2f 100644
--- a/parts/22-query.sh
+++ b/parts/22-query.sh
@@ -1405,7 +1405,7 @@ query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--
 		WHERE
 			jmn.metric_name = 'memory.max_usage_in_bytes'
 			AND isz.bytes > 0
-			AND jmn.metric_value >= ${arg_min_used}*1024*1024*1024
+			AND jmn.metric_value >= ${arg_min_used}::float*1024*1024*1024
 		$order_by
 	EOF
 }

From c84728e9b7ba1ecc5cb81f2d5689e5892135c795 Mon Sep 17 00:00:00 2001
From: Nate Coraor <nate@bx.psu.edu>
Date: Thu, 12 Sep 2024 12:56:07 -0500
Subject: [PATCH 6/6] Include cgroupsv2 memory metric in
 query_tool-input-to-memory-ratio, also return size.

---
 parts/22-query.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/parts/22-query.sh b/parts/22-query.sh
index a9fb4c2f..6045bc1c 100644
--- a/parts/22-query.sh
+++ b/parts/22-query.sh
@@ -1403,7 +1403,7 @@ query_tool-input-to-memory-ratio() { ##? <tool_id> [last=-1] [--like] [--ok] [--
 		JOIN
 			input_sizes isz ON j.id = isz.job_id
 		WHERE
-			jmn.metric_name = 'memory.max_usage_in_bytes'
+			jmn.metric_name IN ('memory.max_usage_in_bytes', 'memory.peak')
 			AND isz.bytes > 0
 			AND jmn.metric_value >= ${arg_min_used}::float*1024*1024*1024
 		$order_by
@@ -2644,7 +2644,8 @@ query_job-inputs() { ##? <id>: Input datasets to a specific job
 				d.state AS d_state,
 				d.deleted AS d_deleted,
 				d.purged AS d_purged,
-				d.object_store_id AS object_store_id
+				d.object_store_id AS object_store_id,
+				d.total_size/1024/1024/1024 AS gb
 			FROM job j
 				JOIN job_to_input_dataset jtid
 					ON j.id = jtid.job_id