-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds slurm and fss monitoring files. #14
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/bash | ||
|
||
# TODO switch to duc #13 | ||
sudo du -sh /data/* | sort -rh > ~/logs/fss_du.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import datetime | ||
import json | ||
import subprocess | ||
from pathlib import Path | ||
|
||
|
||
#Example command we want to extract | ||
#sacct -a -S 2023-02-28-12:00:00 -E 2023-02-28-13:00:00 -P --noconvert --noheader -o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres' | ||
|
||
date_time_list = [] | ||
job_dict = { | ||
"job_ids": {}, | ||
"dates": {}, | ||
"users": {}, | ||
} | ||
save_file = "/home/opc/logs/slurm_usage.json" | ||
|
||
def roundTime(dt=None, roundTo=60): | ||
""" | ||
Round a datetime object to any time lapse in seconds. | ||
Source: https://stackoverflow.com/questions/3463930/how-to-round-the-minute-of-a-datetime-object/10854034#10854034 | ||
Author: Thierry Husson 2012 - Use it as you want but don't blame me. | ||
Args: | ||
dt : datetime.datetime object, default now. | ||
roundTo : Closest number of seconds to round to, default 1 minute. | ||
Returns: | ||
datetime object rounded to whatever value. | ||
""" | ||
if dt == None : dt = datetime.datetime.now() | ||
seconds = (dt.replace(tzinfo=None) - dt.min).seconds | ||
rounding = (seconds+roundTo/2) // roundTo * roundTo | ||
return dt + datetime.timedelta(0,rounding-seconds,-dt.microsecond) | ||
|
||
now = roundTime() | ||
end = now - datetime.timedelta(days=30) | ||
|
||
while now > end: | ||
date_time_list.append(now.strftime("%Y-%m-%d-%H:%M:%S")) | ||
now = now - datetime.timedelta(hours=1) | ||
|
||
def add_or_remove_to_dict(returned_val, date_time_list, index, add=False, remove=False): | ||
""" | ||
Updates the global job_dict to either add or remove elements. | ||
Such as jobs that were run, total gpu usage, and mapping of users to gpu usage. | ||
Args: | ||
returned_val: string of what sacct shell command returns. | ||
date_time_list: list of datetimes in the proper format e.g. strftime("%Y-%m-%d-%H:%M:%S") | ||
index: int which element of date_time_list to use. | ||
add: bool whether to add elements to job_dict. | ||
remove: bool whether to remove elements from job_dict. | ||
|
||
""" | ||
if add == remove: | ||
print("Error in use of add or remove.") | ||
exit() | ||
|
||
total_gpus = 0 | ||
for item in returned_val: | ||
if not item: | ||
continue | ||
slurm_data = item.split("|") | ||
user = slurm_data[0] | ||
job_id = slurm_data[1] | ||
account = slurm_data[2] | ||
n_cpus = int(slurm_data[3]) | ||
req_mem = slurm_data[4] | ||
elapsed_time = int(slurm_data[5]) | ||
alloc_tres = slurm_data[7] | ||
alloc_tres = alloc_tres.split(",") | ||
# ignore the cpu jobs on the cluster for now | ||
if len(alloc_tres) < 4: | ||
continue | ||
num_gpus = int(alloc_tres[2][alloc_tres[2].find("=")+1:]) | ||
num_nodes = int(alloc_tres[3][alloc_tres[3].find("=")+1:]) | ||
total_gpus += num_gpus | ||
if add: | ||
job_dict["job_ids"][job_id] = { | ||
"user": user, | ||
"job_id": job_id, | ||
"account": account, | ||
"n_cpus": n_cpus, | ||
"req_mem": req_mem, | ||
"elapsed_time": elapsed_time, | ||
"num_gpus": num_gpus, | ||
"num_nodes": num_nodes, | ||
} | ||
user_stats = job_dict["users"].get(user) | ||
if user_stats is None: | ||
job_dict["users"][user] = elapsed_time * num_gpus | ||
else: | ||
job_dict["users"][user] += elapsed_time * num_gpus | ||
elif remove: | ||
del job_dict["job_ids"][job_id] | ||
user_stats = job_dict["users"].get(user) | ||
if user_stats is not None: | ||
job_dict["users"][user] -= elapsed_time * num_gpus | ||
if job_dict["users"][user] < 0: | ||
del job_dict["users"][user] | ||
if add: | ||
job_dict["dates"][date_time_list[index]] = total_gpus | ||
elif remove: | ||
del job_dict["dates"][date_time_list[index]] | ||
|
||
my_file = Path(save_file) | ||
|
||
# if it exists just update the latest one and remove the old one. | ||
if my_file.is_file(): | ||
# Remove old entries. | ||
older_end = (end - datetime.timedelta(hours=1)).strftime("%Y-%m-%d-%H:%M:%S") | ||
command = ("sacct -a -S {} ".format(older_end) + | ||
"-E {} -P ".format(end.strftime("%Y-%m-%d-%H:%M:%S")) + | ||
"--noconvert --noheader " | ||
"-o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres' " | ||
) | ||
returned_val = subprocess.check_output(command, shell=True).decode("utf-8").split("\n")[::3] | ||
add_or_remove_to_dict(returned_val, [older_end], 0, remove=True) | ||
|
||
# Add new entries. | ||
hour_less_than_now = (now - datetime.timedelta(hours=1)).strftime("%Y-%m-%d-%H:%M:%S") | ||
command = ("sacct -a -S {} ".format(hour_less_than_now) + | ||
"-E {} -P ".format(now.strftime("%Y-%m-%d-%H:%M:%S")) + | ||
"--noconvert --noheader " | ||
"-o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres' " | ||
) | ||
returned_val = subprocess.check_output(command, shell=True).decode("utf-8").split("\n")[::3] | ||
|
||
add_or_remove_to_dict(returned_val, [hour_less_than_now], 0, add=True) | ||
|
||
# Otherwise just grab all of the entries for the past month. | ||
else: | ||
for index, date_time_val in enumerate(date_time_list[:-1]): | ||
command = ("sacct -a -S {} ".format(date_time_list[index+1]) + | ||
"-E {} -P ".format(date_time_list[index]) + | ||
"--noconvert --noheader " | ||
"-o 'user,JobID,Account,NCPUs,ReqMem,elapsedraw,cputimeraw,alloctres' " | ||
) | ||
|
||
returned_val = subprocess.check_output(command, shell=True).decode("utf-8") | ||
# sacct returns almost triplicate results so instead only select just one. | ||
returned_val = returned_val.split("\n")[::3] | ||
|
||
add_or_remove_to_dict(returned_val, date_time_list, index+1, add=True) | ||
|
||
with open(save_file, "w") as f: | ||
json.dump(job_dict, f, indent=4) |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Old what? Do you mean the old entries?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll update the comment to just say instead