Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding hardware usage and software packages tracker #2195

Merged
merged 25 commits into from
Jul 15, 2022
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
068f710
adding hardware usage and software packages tracker
Jun 27, 2022
c3bcd5b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 30, 2022
dbf75d2
removed stdout redirection to null during import
Jun 30, 2022
53871f9
remove sys.stdout
Jun 30, 2022
44c574c
reverting
Jul 6, 2022
97b8ef3
updated `tracker.py`
Jul 12, 2022
f2b859a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 12, 2022
ccbffce
improved docstring style
Jul 12, 2022
e78a582
removing unnecessary `torch.cuda.synchronize()` call
Jul 12, 2022
d3f4fe7
using the `multiprocessing` library instead of the `@processify` wrap…
Jul 12, 2022
53ae6ad
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 12, 2022
27133d8
style changes
Jul 12, 2022
4dc3347
adding s3fs to `requirements.txt`
Jul 12, 2022
4bc606f
name change to `resource_usage_tracker.py`
Jul 14, 2022
f270ba4
added test
Jul 14, 2022
cf4c93c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 14, 2022
e285481
tag name validation
Jul 14, 2022
8807e05
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 14, 2022
3bad0e3
flake8 updates
Jul 14, 2022
a02e0bd
fixed test file
Jul 14, 2022
4b7f7dc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 14, 2022
2f09fd4
update test file
Jul 15, 2022
fe622d0
fixing empty utilization (due to very short experiment)
Jul 15, 2022
a1fd13d
added # noqa E402
Jul 15, 2022
c18b6bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 188 additions & 0 deletions ludwig/benchmarking/tracker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
"""Some parts are inspired by https://github.com/Breakend/experiment-impact-
tracker/blob/master/experiment_impact_tracker/compute_tracker.py."""

import multiprocessing
import os
import shutil
import sys
import time
import traceback
from queue import Empty as EmptyQueueException
from statistics import mean
from typing import Any, Dict, Optional

import psutil
import torch
from gpustat.core import GPUStatCollection

from ludwig.globals import LUDWIG_VERSION
from ludwig.utils.data_utils import load_json, save_json

# The following third-party imports print verbose output at import time.
# Temporarily point stdout at devnull while importing them; the try/finally
# guarantees stdout is restored (and the devnull handle closed by the `with`)
# even if one of the imports raises — the original unconditional redirection
# would otherwise leave stdout broken for the rest of the process.
with open(os.devnull, "w") as _devnull:
    sys.stdout = _devnull
    try:
        from experiment_impact_tracker.cpu.common import get_my_cpu_info
        from experiment_impact_tracker.gpu.nvidia import get_gpu_info
        from experiment_impact_tracker.py_environment.common import get_python_packages_and_versions
    finally:
        sys.stdout = sys.__stdout__

# Sentinel sent through the queue to tell the monitoring child process to
# flush its report and exit.
STOP_MESSAGE = "stop"


def monitor(queue: multiprocessing.Queue, info: Dict[str, Any], output_dir: str, logging_interval: int) -> None:
    """Monitors hardware resource use as part of a separate process.

    Populate `info` with system specific metrics (GPU, CPU, RAM) at a `logging_interval` interval and saves the output
    in `output_dir`. Runs until `STOP_MESSAGE` is received on `queue`, at which point the metrics collected
    so far are written to `<output_dir>/<tag>_temp.json` and the process returns.

    Args:
        queue: queue from which we can push and retrieve messages sent to the child process.
        info: dictionary containing system resource usage information about the parent process.
        output_dir: directory where the contents of `info` will be saved.
        logging_interval: time interval at which we will poll the system for usage metrics.
    """
    # Initialize the time-series lists that each polling pass below appends to.
    # GPU entries (keys like "gpu_0") were created by the parent in
    # Tracker.populate_static_information before this process was forked.
    for key in info["system"]:
        if "gpu_" in key:
            info["system"][key]["memory_used"] = []
    info["system"]["cpu_utilization"] = []
    info["system"]["ram_utilization"] = []

    while True:
        try:
            # Non-blocking read so polling continues at `logging_interval`
            # even when no message is pending.
            message = queue.get(block=False)
            if isinstance(message, str):
                if message == STOP_MESSAGE:
                    # Parent asked us to stop: persist everything collected and exit.
                    save_json(os.path.join(output_dir, info["tag"] + "_temp.json"), info)
                    return
                # NOTE(review): string messages other than STOP_MESSAGE are
                # silently dropped — confirm this is intended.
            else:
                # Non-string message: not addressed to us, put it back on the queue.
                queue.put(message)
        except EmptyQueueException:
            # Nothing in the queue this pass; fall through to polling.
            pass
        # One sample per pass: per-GPU memory (MiB, per gpustat), CPU %, RAM %.
        if torch.cuda.is_available():
            gpu_infos = GPUStatCollection.new_query()
            for i, gpu_info in enumerate(gpu_infos):
                gpu_key = f"gpu_{i}"
                info["system"][gpu_key]["memory_used"].append(gpu_info.memory_used)
        info["system"]["cpu_utilization"].append(psutil.cpu_percent())
        info["system"]["ram_utilization"].append(psutil.virtual_memory().percent)
        time.sleep(logging_interval)


class Tracker:
    """Track system resource (hardware and software) usage by a chunk of code.

    Intended to be used as a context manager:

        with Tracker("train", output_dir) as tracker:
            ...

    On entry, static hardware/software information is collected and a child
    process is forked to poll GPU/CPU/RAM usage at `logging_interval`. On exit,
    the child's report is merged with timing and disk metrics and saved to
    `<output_dir>/<tag>_metrics.json`.

    Attributes:
        tag: a string tag about the process that we're tracking. Examples: train, evaluate, preprocess, etc.
        output_dir: path where metrics are saved.
        logging_interval: time interval in seconds at which system is polled for resource usage.
        num_batches: number of batches of training or evaluation process.
        num_examples: number of examples of training or evaluation process.
    """

    def __init__(
        self,
        tag: str,
        output_dir: str,
        logging_interval: int = 1,
        num_batches: Optional[int] = None,
        num_examples: Optional[int] = None,
    ) -> None:
        self.output_dir = output_dir
        self.tag = tag
        # `info` accumulates the full report; "system" holds per-resource metrics.
        self.info = {"tag": self.tag, "system": {}}
        self.num_batches = num_batches
        self.num_examples = num_examples
        self.logging_interval = logging_interval
        self.launched = False
        os.makedirs(self.output_dir, exist_ok=True)

    def populate_static_information(self) -> None:
        """Populates the report with static software and hardware information."""
        self.info["ludwig_version"] = LUDWIG_VERSION
        self.info["start_disk_usage"] = shutil.disk_usage(os.path.expanduser("~")).used

        # Python environment and CPU information.
        self.info["system"]["python_packages_and_versions"] = [
            str(package) for package in get_python_packages_and_versions()
        ]
        cpu_info = get_my_cpu_info()
        self.info["system"]["cpu_architecture"] = cpu_info["arch"]
        self.info["system"]["num_cpu"] = cpu_info["count"]
        self.info["system"]["cpu_name"] = cpu_info["brand_raw"]

        # GPU information: one "gpu_<i>" entry per visible CUDA device. The
        # monitor child process later appends "memory_used" samples to these.
        if torch.cuda.is_available():
            gpu_infos = get_gpu_info()
            for i, gpu_info in enumerate(gpu_infos):
                gpu_key = f"gpu_{i}"
                self.info["system"][gpu_key] = {
                    "name": gpu_info["name"],
                    "total_memory": gpu_info["total_memory"],
                    "driver_version": gpu_info["driver_version"],
                    "cuda_version": gpu_info["cuda_version"],
                }

        self.info["start_time"] = time.time()
        self.info["num_examples"] = self.num_examples
        # Record num_batches alongside num_examples (previously accepted but
        # never written into the report).
        self.info["num_batches"] = self.num_batches

    def __enter__(self):
        """Populates static information and forks process to monitor resource usage."""
        if self.launched:
            raise ValueError("Tracker already launched.")

        self.populate_static_information()
        try:
            # "fork" is required so the child inherits the `self.info` dict
            # populated above (a "spawn" child would re-import the module instead).
            ctx = multiprocessing.get_context("fork")
            self.queue = ctx.Queue()
            self.p = ctx.Process(
                target=monitor,
                args=(self.queue, self.info, self.output_dir, self.logging_interval),
            )
            self.p.start()
            self.launched = True
        except Exception:
            self.launched = False
            print("Encountered exception when launching tracker.")
            print(traceback.format_exc())
            raise

        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Waits for monitoring process to exit.

        Computes and postprocesses more metrics. Saves report.
        """
        self.queue.put(STOP_MESSAGE)
        if torch.cuda.is_available():
            # Make sure queued CUDA work is accounted for before stopping the monitor.
            torch.cuda.synchronize()
        self.p.join()

        # Adopt the child's report (which includes the utilization time series)
        # and remove its temporary file.
        temp_path = os.path.join(self.output_dir, self.info["tag"] + "_temp.json")
        self.info = load_json(temp_path)
        os.remove(temp_path)

        self.info["end_time"] = time.time()
        self.info[f"{self.tag}_total_duration"] = self.info["end_time"] - self.info["start_time"]

        if self.num_examples:
            self.info["examples_per_second"] = self.num_examples / self.info[f"{self.tag}_total_duration"]
        self.info["end_disk_usage"] = shutil.disk_usage(os.path.expanduser("~")).used
        self.info["disk_footprint"] = self.info["end_disk_usage"] - self.info["start_disk_usage"]

        # Guard against empty utilization lists: a run shorter than one
        # `logging_interval` ends before the monitor records a single sample,
        # and bare max()/mean() would raise on the empty lists.
        for key in self.info["system"]:
            if "gpu_" in key:
                self.info["system"][key]["max_memory_used"] = max(self.info["system"][key]["memory_used"], default=None)
        cpu_utilization = self.info["system"]["cpu_utilization"]
        ram_utilization = self.info["system"]["ram_utilization"]
        self.info["system"]["max_cpu_utilization"] = max(cpu_utilization, default=None)
        self.info["system"]["max_ram_utilization"] = max(ram_utilization, default=None)
        self.info["system"]["average_cpu_utilization"] = mean(cpu_utilization) if cpu_utilization else None
        self.info["system"]["average_ram_utilization"] = mean(ram_utilization) if ram_utilization else None

        save_json(os.path.join(self.output_dir, self.info["tag"] + "_metrics.json"), self.info)
4 changes: 4 additions & 0 deletions ludwig/utils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
import copy
import os
import random
import sys
import traceback
from collections import OrderedDict
from collections.abc import Mapping
from functools import wraps
from multiprocessing import Process, Queue

import numpy
import torch
Expand Down
3 changes: 3 additions & 0 deletions requirements_tracker.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
experiment_impact_tracker
gpustat
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be a separate requirements_tracker.txt file or do I need to add it to the main requirements.txt file?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be ok with adding this to the main requirements.txt file, especially if hardware resource usage tracking adds marginal overhead.

Curious about other people's opinions on this: @dantreiman @w4nderlust @tgaddair

psutil