Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds slurm and fss monitoring files. #14

Merged
merged 1 commit into from
Mar 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions samples/monitoring/fss_usage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# TODO switch to duc #13
sudo du -sh /data/* | sort -rh > ~/logs/fss_du.log
145 changes: 145 additions & 0 deletions samples/monitoring/slurm_accounting_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import datetime
import json
import subprocess
from pathlib import Path


#Example command we want to extract
#sacct -a -S 2023-02-28-12:00:00 -E 2023-02-28-13:00:00 -P --noconvert --noheader -o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres'

date_time_list = []
job_dict = {
"job_ids": {},
"dates": {},
"users": {},
}
save_file = "/home/opc/logs/slurm_usage.json"

def roundTime(dt=None, roundTo=60):
"""
Round a datetime object to any time lapse in seconds.
Source: https://stackoverflow.com/questions/3463930/how-to-round-the-minute-of-a-datetime-object/10854034#10854034
Author: Thierry Husson 2012 - Use it as you want but don't blame me.
Args:
dt : datetime.datetime object, default now.
roundTo : Closest number of seconds to round to, default 1 minute.
Returns:
datetime object rounded to whatever value.
"""
if dt == None : dt = datetime.datetime.now()
seconds = (dt.replace(tzinfo=None) - dt.min).seconds
rounding = (seconds+roundTo/2) // roundTo * roundTo
return dt + datetime.timedelta(0,rounding-seconds,-dt.microsecond)

now = roundTime()
end = now - datetime.timedelta(days=30)

while now > end:
date_time_list.append(now.strftime("%Y-%m-%d-%H:%M:%S"))
now = now - datetime.timedelta(hours=1)

def add_or_remove_to_dict(returned_val, date_time_list, index, add=False, remove=False):
"""
Updates the global job_dict to either add or remove elements.
Such as jobs that were run, total gpu usage, and mapping of users to gpu usage.
Args:
returned_val: string of what sacct shell command returns.
date_time_list: list of datetimes in the proper format e.g. strftime("%Y-%m-%d-%H:%M:%S")
index: int which element of date_time_list to use.
add: bool whether to add elements to job_dict.
remove: bool whether to remove elements from job_dict.

"""
if add == remove:
print("Error in use of add or remove.")
exit()

total_gpus = 0
for item in returned_val:
if not item:
continue
slurm_data = item.split("|")
user = slurm_data[0]
job_id = slurm_data[1]
account = slurm_data[2]
n_cpus = int(slurm_data[3])
req_mem = slurm_data[4]
elapsed_time = int(slurm_data[5])
alloc_tres = slurm_data[7]
alloc_tres = alloc_tres.split(",")
# ignore the cpu jobs on the cluster for now
if len(alloc_tres) < 4:
continue
num_gpus = int(alloc_tres[2][alloc_tres[2].find("=")+1:])
num_nodes = int(alloc_tres[3][alloc_tres[3].find("=")+1:])
total_gpus += num_gpus
if add:
job_dict["job_ids"][job_id] = {
"user": user,
"job_id": job_id,
"account": account,
"n_cpus": n_cpus,
"req_mem": req_mem,
"elapsed_time": elapsed_time,
"num_gpus": num_gpus,
"num_nodes": num_nodes,
}
user_stats = job_dict["users"].get(user)
if user_stats is None:
job_dict["users"][user] = elapsed_time * num_gpus
else:
job_dict["users"][user] += elapsed_time * num_gpus
elif remove:
del job_dict["job_ids"][job_id]
user_stats = job_dict["users"].get(user)
if user_stats is not None:
job_dict["users"][user] -= elapsed_time * num_gpus
if job_dict["users"][user] < 0:
del job_dict["users"][user]
if add:
job_dict["dates"][date_time_list[index]] = total_gpus
elif remove:
del job_dict["dates"][date_time_list[index]]

my_file = Path(save_file)

# if it exists just update the latest one and remove the old one.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Old what? Do you mean the old entries?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll update the comment to just say instead

Update the file with the latest entries if it exists.

if my_file.is_file():
# Remove old entries.
older_end = (end - datetime.timedelta(hours=1)).strftime("%Y-%m-%d-%H:%M:%S")
command = ("sacct -a -S {} ".format(older_end) +
"-E {} -P ".format(end.strftime("%Y-%m-%d-%H:%M:%S")) +
"--noconvert --noheader "
"-o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres' "
)
returned_val = subprocess.check_output(command, shell=True).decode("utf-8").split("\n")[::3]
add_or_remove_to_dict(returned_val, [older_end], 0, remove=True)

# Add new entries.
hour_less_than_now = (now - datetime.timedelta(hours=1)).strftime("%Y-%m-%d-%H:%M:%S")
command = ("sacct -a -S {} ".format(hour_less_than_now) +
"-E {} -P ".format(now.strftime("%Y-%m-%d-%H:%M:%S")) +
"--noconvert --noheader "
"-o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres' "
)
returned_val = subprocess.check_output(command, shell=True).decode("utf-8").split("\n")[::3]

add_or_remove_to_dict(returned_val, [hour_less_than_now], 0, add=True)

# Otherwise just grab all of the entries for the past month.
else:
for index, date_time_val in enumerate(date_time_list[:-1]):
command = ("sacct -a -S {} ".format(date_time_list[index+1]) +
"-E {} -P ".format(date_time_list[index]) +
"--noconvert --noheader "
"-o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres' "
)

returned_val = subprocess.check_output(command, shell=True).decode("utf-8")
# sacct returns almost triplicate results so instead only select just one.
returned_val = returned_val.split("\n")[::3]

add_or_remove_to_dict(returned_val, date_time_list, index+1, add=True)

with open(save_file, "w") as f:
json.dump(job_dict, f, indent=4)