Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented simple monitor disabling #178

Merged
merged 6 commits into from
Nov 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ jobs:
# Sets up useful environment variables
- name: Setup environment variables
run: |
echo "::set-env name=DIMAGE::hepsoftwarefoundation/${PLATFORM/_py2/}"
echo "DIMAGE=hepsoftwarefoundation/${PLATFORM/_py2/}" >> $GITHUB_ENV
if [[ "$PLATFORM" == *_py2 ]];
then
echo "::set-env name=CMAKE_EXTRA::-DPYTHON_TEST=python2"
echo "CMAKE_EXTRA=-DPYTHON_TEST=python2" >> $GITHUB_ENV
fi
env:
PLATFORM: ${{ matrix.platform }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/flake8.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
# Sets up useful environment variables
- name: Setup environment variables
run: |
echo "::set-env name=DIMAGE::hepsoftwarefoundation/u20-dev"
echo "DIMAGE=hepsoftwarefoundation/u20-dev" >> $GITHUB_ENV
env:
PLATFORM: u20-dev

Expand Down
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ The `prmon` binary is invoked with the following arguments:
```sh
prmon [--pid PPP] [--filename prmon.txt] [--json-summary prmon.json] \
[--interval 30] [--suppress-hw-info] [--units] [--netdev DEV] \
[--disable MON1] \
[-- prog arg arg ...]
```

Expand All @@ -93,6 +94,9 @@ prmon [--pid PPP] [--filename prmon.txt] [--json-summary prmon.json] \
* `--suppress-hw-info` flag that turns-off hardware information collection
* `--units` add information on units for each metric to JSON file
* `--netdev` restricts network statistics to one (or more) network devices
* `--disable` is used to disable specific monitors (and can be specified multiple times);
the default is that `prmon` monitors everything that it can
* Note that the `wallmon` monitor is the only monitor that cannot be disabled
* `--` after this argument the following arguments are treated as a program to invoke
and remaining arguments are passed to it; `prmon` will then monitor this process
instead of being given a PID via `--pid`
Expand All @@ -101,13 +105,15 @@ prmon [--pid PPP] [--filename prmon.txt] [--json-summary prmon.json] \
incomplete arguments. If `prmon` starts a program itself (using `--`) then
`prmon` will exit with the exit code of the child process.

When invoked with `-h` or `--help` usage information is printed, as well as a
list of all available monitoring components.

## Outputs

In the `filename` output file, plain text with statistics written every
`interval` seconds are written. The first line gives the column names.

In the `json-summmary` file values for the maximum and average statistics
In the `json-summary` file values for the maximum and average statistics
are given in JSON format. This file is rewritten every `interval` seconds
with the current summary values. Use the `--units` option to see exactly
which units are used for each metric (the value of `1` for a unit means
Expand Down
54 changes: 39 additions & 15 deletions package/src/prmon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,36 @@ bool prmon::sigusr1 = false;
int ProcessMonitor(const pid_t mpid, const std::string filename,
const std::string json_summary_file, const time_t interval,
const bool store_hw_info, const bool store_unit_info,
const std::vector<std::string> netdevs) {
const std::vector<std::string> netdevs,
const std::vector<std::string> disabled_monitors) {
signal(SIGUSR1, prmon::SignalCallbackHandler);

// This is the vector of all monitoring components
std::unordered_map<std::string, std::unique_ptr<Imonitor>> monitors{};

auto registered_monitors = registry::Registry<Imonitor>::list_registered();
for (const auto& class_name : registered_monitors) {
std::unique_ptr<Imonitor> new_monitor_p(
registry::Registry<Imonitor>::create(class_name));
if (new_monitor_p) {
if (new_monitor_p->is_valid()) {
monitors[class_name] = std::move(new_monitor_p);
// Check if the monitor should be enabled
bool state = true;
for (const auto& disabled : disabled_monitors) {
if (class_name == disabled) state = false;
}
if (state) {
std::unique_ptr<Imonitor> new_monitor_p(
registry::Registry<Imonitor>::create(class_name));
if (new_monitor_p) {
if (new_monitor_p->is_valid()) {
monitors[class_name] = std::move(new_monitor_p);
}
} else {
std::cerr << "Registration of monitor " << class_name << " FAILED"
<< std::endl;
}
} else {
std::cerr << "Registration of monitor " << class_name << " FAILED"
<< std::endl;
}
}
// The wallclock monitor is always needed as it is used for average stat
// generation
// generation - wallmon cannot be disabled, but this is prechecked in
// prmon::valid_monitor_disable before we get here
if (monitors.count("wallmon") != 1) {
std::cerr << "Failed to initialise mandatory wallclock monitoring class"
<< std::endl;
Expand Down Expand Up @@ -222,6 +231,7 @@ int main(int argc, char* argv[]) {
std::string filename{default_filename};
std::string jsonSummary{default_json_summary};
std::vector<std::string> netdevs{};
std::vector<std::string> disabled_monitors{};
unsigned int interval{default_interval};
bool store_hw_info{default_store_hw_info};
bool store_unit_info{default_store_unit_info};
Expand All @@ -232,15 +242,16 @@ int main(int argc, char* argv[]) {
{"filename", required_argument, NULL, 'f'},
{"json-summary", required_argument, NULL, 'j'},
{"interval", required_argument, NULL, 'i'},
{"disable", required_argument, NULL, 'd'},
{"suppress-hw-info", no_argument, NULL, 's'},
{"units", no_argument, NULL, 'u'},
{"netdev", required_argument, NULL, 'n'},
{"help", no_argument, NULL, 'h'},
{0, 0, 0, 0}};

int c;
while ((c = getopt_long(argc, argv, "-p:f:j:i:sun:h", long_options, NULL)) !=
-1) {
while ((c = getopt_long(argc, argv, "-p:f:j:i:d:sun:h", long_options,
NULL)) != -1) {
switch (char(c)) {
case 'p':
pid = std::stoi(optarg);
Expand All @@ -264,6 +275,11 @@ int main(int argc, char* argv[]) {
case 'n':
netdevs.push_back(optarg);
break;
case 'd':
// Check we got a valid monitor name
if (prmon::valid_monitor_disable(optarg))
disabled_monitors.push_back(optarg);
break;
case 'h':
do_help = 1;
break;
Expand Down Expand Up @@ -296,6 +312,11 @@ int main(int argc, char* argv[]) {
<< (default_store_unit_info ? "true" : "false") << ")\n"
<< "[--netdev, -n dev] Network device to monitor (can be given\n"
<< " multiple times; default ALL devices)\n"
<< "[--disable, -d mon] Disable monitor component\n"
<< " (can be given multiple times);\n"
<< " all monitors enabled by default\n"
<< " Special name '[~]all' sets default "
"state\n"
<< "[--] prog [arg] ... Instead of monitoring a PID prmon will\n"
<< " execute the given program + args and\n"
<< " monitor this (must come after other \n"
Expand All @@ -311,6 +332,8 @@ int main(int argc, char* argv[]) {
<< std::endl;
}
std::cout << std::endl;
std::cout << "More information: https://github.com/HSF/prmon\n"
<< std::endl;
return 0;
}

Expand All @@ -329,7 +352,7 @@ int main(int argc, char* argv[]) {
else
std::cerr << "found none";
std::cerr << std::endl;
return 0;
return EXIT_FAILURE;
}

if (got_pid) {
Expand All @@ -338,7 +361,7 @@ int main(int argc, char* argv[]) {
return 1;
}
ProcessMonitor(pid, filename, jsonSummary, interval, store_hw_info,
store_unit_info, netdevs);
store_unit_info, netdevs, disabled_monitors);
} else {
if (child_args == argc) {
std::cerr << "Found marker for child program to execute, but with no "
Expand All @@ -350,7 +373,8 @@ int main(int argc, char* argv[]) {
execvp(argv[child_args], &argv[child_args]);
} else if (child > 0) {
return ProcessMonitor(child, filename, jsonSummary, interval,
store_hw_info, store_unit_info, netdevs);
store_hw_info, store_unit_info, netdevs,
disabled_monitors);
}
}

Expand Down
22 changes: 22 additions & 0 deletions package/src/prmonutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
#include <iostream>
#include <sstream>

#include "Imonitor.h"
#include "registry.h"

namespace prmon {

bool kernel_proc_pid_test(const pid_t pid) {
Expand Down Expand Up @@ -117,4 +120,23 @@ int reap_children() {
return return_code;
}

// Check if a request to disable a monitor is valid or not
bool valid_monitor_disable(const std::string disable_name) {
// First check this is not an attempt to disable the wallmon
if (disable_name == "wallmon") {
std::cerr << "prmon: wallmon monitor cannot be disabled (ignored)"
<< std::endl;
return false;
}
auto monitors = registry::Registry<Imonitor>::list_registered();
for (const auto& monitor_name : monitors) {
if (monitor_name == disable_name) {
return true;
}
}
std::cerr << "prmon: " << disable_name
<< " is an invalid monitor name (ignored)" << std::endl;
return false;
}

} // namespace prmon
4 changes: 4 additions & 0 deletions package/src/prmonutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <sys/wait.h>
#include <unistd.h>

#include <string>
#include <vector>

namespace prmon {
Expand All @@ -20,6 +21,9 @@ bool kernel_proc_pid_test(const pid_t pid);
std::vector<pid_t> pstree_pids(const pid_t mother_pid);
std::vector<pid_t> offspring_pids(const pid_t mother_pid);

// Switch on/off states for individual monitors
bool valid_monitor_disable(const std::string disable_name);

// Signal handlers
extern bool sigusr1;
void SignalCallbackHandler(int);
Expand Down
4 changes: 4 additions & 0 deletions package/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ script_install(SCRIPT net_burner.py)
script_install(SCRIPT http_block.py DESTINATION cgi-bin)
script_install(SCRIPT test_count.py)
script_install(SCRIPT test_exit.py)
script_install(SCRIPT test_disable.py)

# Setup the target version of Python we will use for testing
set(PYTHON_TEST "python3" CACHE STRING "Python binary to use for tests")
Expand Down Expand Up @@ -83,3 +84,6 @@ add_test(NAME testUnits COMMAND ${PYTHON_TEST} test_cpu.py --units --time 3 --sl

# Test passing the child exit code works
add_test(NAME testExitCode COMMAND ${PYTHON_TEST} test_exit.py --exit-code 43)

# Test for disabling monitor components
add_test(NAME testDisable COMMAND ${PYTHON_TEST} test_disable.py --disable iomon countmon)
73 changes: 73 additions & 0 deletions package/tests/test_disable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#! /usr/bin/env python3
#
# Copyright (C) 2020 CERN
# License Apache2 - see LICENCE file

"""prmon test harness to check monitors can be disabled correctly"""

import argparse
import json
import subprocess
import sys
import unittest


def setup_configurable_test(disable=[]):
"""Wrap the class definition in a function to allow arguments to be passed"""

class ConfigurableProcessMonitor(unittest.TestCase):
"""Test class for a specific set of parameters"""

def test_run_test_with_params(self):
"""Actual test runner"""
burn_cmd = ["./burner", "--time", "3"]

prmon_cmd = ["../prmon", "--interval", "1"]
for d in disable:
prmon_cmd.extend(("-d", d))
prmon_cmd.append("--")
prmon_cmd.extend(burn_cmd)
prmon_p = subprocess.Popen(prmon_cmd, shell=False)

prmon_rc = prmon_p.wait()

self.assertEqual(prmon_rc, 0, "Non-zero return code from prmon")

with open("prmon.json") as infile:
prmon_json = json.load(infile)

# Check we don't have fields corresponding to disabled monitors
monitor_fields = {
"countmon": "nprocs",
"cpumon": "stime",
"iomon": "rchar",
"memmon": "vmem",
"netmon": "rx_bytes",
"nvidiamon": "ngpus",
}
for d in disable:
if d in monitor_fields:
self.assertFalse(monitor_fields[d] in prmon_json["Max"])

return ConfigurableProcessMonitor


def main_parse_args_and_get_test():
"""Parse arguments and call test class generator
returning the test case (which is unusual for a
main() function)"""
parser = argparse.ArgumentParser(description="Configurable test runner")
parser.add_argument("--disable", nargs="+")

args = parser.parse_args()
# Stop unittest from being confused by the arguments
sys.argv = sys.argv[:1]

return setup_configurable_test(args.disable)


if __name__ == "__main__":
# As unitest will only run tests in the global namespace
# we return the test instance from main()
the_test = main_parse_args_and_get_test()
unittest.main()