Skip to content

Commit

Permalink
Nvidia monitor fixes (#239)
Browse files Browse the repository at this point in the history
* Update output parsing for nvidiamon

* Update test files for nvidia-smi parsing

Update to the new nvidia-smi pmon output fields
Add a pycuda GPU burner script for tests

* Parse ccpm field as string

This can be a "-" instead of 0

* Update precook script and precooked outputs

Ensure precooked values are fixed to what we want

* Fix hash-bang and mode on GPU burner

* Python reformatting

With latest versions of black and flake8

There is one import in gpu-burner.py that is needed (pycuda.autoinit)
as it has side effects, so this is marked as exempt for flake8

---------

Co-authored-by: Johannes Elmsheuser <[email protected]>
Co-authored-by: Graeme Stewart <[email protected]>
  • Loading branch information
3 people authored May 3, 2024
1 parent 05939d1 commit 0696123
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 17 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,4 @@ to CMake using `Gperftools_ROOT_DIR`.

# Copyright

Copyright (c) 2018-2023 CERN.
Copyright (c) 2018-2024 CERN.
41 changes: 41 additions & 0 deletions package/scripts/gpu-burner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#! /usr/bin/env python3
#
# This is a slightly adapted "hello, world" script from
# pycuda, that can be used for stressing a CUDA GPU for
# tests
#
# pycuda is required!
#

import pycuda.autoinit  # noqa: F401  (needed for its side effect: initialises CUDA)
import pycuda.driver as drv
import numpy
from time import time

from pycuda.compiler import SourceModule

# Trivial element-wise kernel: dest[i] = a[i] * b[i] + c[i]
mod = SourceModule(
    """
__global__ void multiply_them(float *dest, float *a, float *b, float *c)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i] + c[i];
}
"""
)

multiply_them = mod.get_function("multiply_them")

# Random single-precision inputs; 1024 matches the kernel launch block size
a = numpy.random.randn(1024).astype(numpy.float32)
b = numpy.random.randn(1024).astype(numpy.float32)
c = numpy.random.randn(1024).astype(numpy.float32)

dest = numpy.zeros_like(a)

# Launch the kernel repeatedly for ~20 seconds to keep the GPU busy,
# so that monitoring tools have something to observe
start = time()
while time() - start < 20:
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b), drv.In(c), block=(1024, 1, 1), grid=(1, 1)
    )

# Residual check: should print an all-zero array if the kernel is correct.
# The expected value a*b + c must be parenthesised — the unparenthesised
# form "dest - a * b + c" would print 2*c instead of the residual.
print(dest - (a * b + c))
15 changes: 10 additions & 5 deletions package/scripts/precook_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,19 +95,24 @@ def make_net(proc_net, fixed_value, rand=False):

def make_nvidia(proc_nvidia, fixed_value, rand=False):
# idx
print(proc_nvidia, fixed_value, rand)
smi_fname = os.path.join(proc_nvidia, "smi")
pct_lim = 100
memory_lim = 10000
with open(smi_fname, "w") as f:
params = [
0, # idx
pid, # pid
"G", # type
random.randint(0, memory_lim) if rand else fixed_value, # sm
random.randint(0, memory_lim) if rand else fixed_value, # mem
# enc, dec are not monitored metrics
0, # enc
0, # dec
random.randint(0, pct_lim) if rand else fixed_value, # sm
random.randint(0, pct_lim) if rand else fixed_value, # mem
# The following are not monitored metrics
"-", # enc
"-", # dec
"-", # jpg
"-", # ofa
random.randint(0, memory_lim) if rand else fixed_value, # fb
0, # ccpm
"python3", # command
]
for param in params:
Expand Down
2 changes: 1 addition & 1 deletion package/scripts/precooked_tests/drop/1/nvidia/smi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0 1729 G 50 50 0 0 50 python3
0 1729 G 50 50 - - - - 50 0 python3
2 changes: 1 addition & 1 deletion package/scripts/precooked_tests/drop/2/nvidia/smi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0 1729 G 100 100 0 0 100 python3
0 1729 G 100 100 - - - - 100 0 python3
2 changes: 1 addition & 1 deletion package/scripts/precooked_tests/drop/3/nvidia/smi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0 1729 G 20 20 0 0 20 python3
0 1729 G 20 20 - - - - 20 0 python3
8 changes: 5 additions & 3 deletions package/scripts/prmon_compress_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,11 @@ def main():

parser.add_argument(
"--precision",
type=lambda x: float(x)
if 0 < float(x) < 1
else parser.exit(-1, "Precision must be strictly between 0 and 1"),
type=lambda x: (
float(x)
if 0 < float(x) < 1
else parser.exit(-1, "Precision must be strictly between 0 and 1")
),
default=0.05,
help="precision value for interpolation threshold",
)
Expand Down
10 changes: 5 additions & 5 deletions package/src/nvidiamon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,20 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
// Loop over output
unsigned int gpu_idx{}, sm{}, mem{}, fb_mem{};
pid_t pid{};
std::string enc{}, dec{}, cg_type{}, cmd_name{};
std::string enc{}, dec{}, jpg{}, ofa{}, cg_type{}, ccpm{}, cmd_name{};
std::unordered_map<unsigned int, bool>
activegpus{}; // Avoid double counting active GPUs
for (const auto& s : cmd_result.second) {
if (s[0] == '#') continue;
std::istringstream instr(s);
instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> fb_mem >>
cmd_name;
instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> jpg >> ofa >> fb_mem >>
ccpm >> cmd_name;
auto read_ok = !(instr.fail() || instr.bad()); // eof() is ok
if (read_ok) {
if (log_level <= spdlog::level::debug) {
std::stringstream strm;
strm << "Good read: " << gpu_idx << " " << pid << " " << cg_type << " "
<< sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
<< sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
<< " " << cmd_name << std::endl;
debug(strm.str());
}
Expand All @@ -115,7 +115,7 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
std::stringstream strm;
strm << "Bad read of line: " << s << std::endl;
strm << "Parsed to: " << gpu_idx << " " << pid << " " << cg_type << " "
<< sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
<< sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
<< " " << cmd_name << std::endl;

strm << "StringStream status: good()=" << instr.good();
Expand Down

0 comments on commit 0696123

Please sign in to comment.