From b7bdd8c26847a1f9688a8c9c2593c07e4f78bd58 Mon Sep 17 00:00:00 2001 From: Jacob Freck Date: Thu, 16 Aug 2018 15:18:43 -0400 Subject: [PATCH] Feature: add brief flag to debug tool (#634) * add brief flag * add some docs * fix requirements --- .../client/cluster/helpers/diagnostics.py | 16 ++++---- aztk/spark/client/cluster/operations.py | 4 +- aztk/spark/utils/debug.py | 39 +++++++++++++++---- .../spark/endpoints/cluster/cluster_debug.py | 7 +++- docs/10-clusters.md | 9 +++++ requirements.txt | 12 +++--- 6 files changed, 61 insertions(+), 26 deletions(-) diff --git a/aztk/spark/client/cluster/helpers/diagnostics.py b/aztk/spark/client/cluster/helpers/diagnostics.py index e99f432f..e85be8d0 100644 --- a/aztk/spark/client/cluster/helpers/diagnostics.py +++ b/aztk/spark/client/cluster/helpers/diagnostics.py @@ -6,11 +6,11 @@ from aztk.utils import helpers -def _run(spark_cluster_operations, cluster_id, output_directory=None): +def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=False): # copy debug program to each node output = spark_cluster_operations.copy( cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True) - ssh_cmd = _build_diagnostic_ssh_command() + ssh_cmd = _build_diagnostic_ssh_command(brief) run_output = spark_cluster_operations.run(cluster_id, ssh_cmd, host=True) remote_path = "/tmp/debug.zip" result = None @@ -19,25 +19,25 @@ def _run(spark_cluster_operations, cluster_id, output_directory=None): result = spark_cluster_operations.download(cluster_id, remote_path, local_path, host=True) # write run output to debug/ directory - with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), 'w', encoding="UTF-8") as f: - [f.write(line + '\n') for node_output in run_output for line in node_output.output] + with open(os.path.join(output_directory, "debug-output.txt"), 'w', encoding="UTF-8") as f: + [f.write(node_output.output + '\n') for node_output in run_output] else: result = spark_cluster_operations.download(cluster_id, remote_path, host=True) return result -def _build_diagnostic_ssh_command(): +def _build_diagnostic_ssh_command(brief): return "sudo rm -rf /tmp/debug.zip; "\ "sudo apt-get install -y python3-pip; "\ "sudo -H pip3 install --upgrade pip; "\ "sudo -H pip3 install docker; "\ - "sudo python3 /tmp/debug.py" + "sudo python3 /tmp/debug.py {}".format(brief) -def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None): +def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False): try: - output = _run(spark_cluster_operations, cluster_id, output_directory) + output = _run(spark_cluster_operations, cluster_id, output_directory, brief) return output except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index cd7140ea..edc8fb9c 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -194,7 +194,7 @@ def download(self, return download.cluster_download(self._core_cluster_operations, id, source_path, destination_path, host, internal, timeout) - def diagnostics(self, id, output_directory=None): + def diagnostics(self, id, output_directory: str = None, brief: bool = False): """Download a file from every node in a cluster. 
Args: @@ -206,7 +206,7 @@ def diagnostics(self, id, output_directory=None): Returns: :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ - return diagnostics.run_cluster_diagnostics(self, id, output_directory) + return diagnostics.run_cluster_diagnostics(self, id, output_directory, brief) def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): """Get the log for a running or completed application diff --git a/aztk/spark/utils/debug.py b/aztk/spark/utils/debug.py index 6aca24c5..3dda40aa 100644 --- a/aztk/spark/utils/debug.py +++ b/aztk/spark/utils/debug.py @@ -6,6 +6,7 @@ import json import os import socket +import sys import tarfile from subprocess import STDOUT, CalledProcessError, check_output from zipfile import ZIP_DEFLATED, ZipFile @@ -14,16 +15,24 @@ def main(): - zipf = create_zip_archive() - - # general node diagnostics - zipf.writestr("hostname.txt", data=get_hostname()) - zipf.writestr("df.txt", data=get_disk_free()) + brief = sys.argv[1] == "True" # docker container diagnostics docker_client = docker.from_env() - for filename, data in get_docker_diagnostics(docker_client): - zipf.writestr(filename, data=data) + + zipf = create_zip_archive() + + if brief: + for filename, data in get_brief_diagnostics(): + print("writing {} to zip", filename) + zipf.writestr(filename, data=data) + else: + # general node diagnostics + zipf.writestr("hostname.txt", data=get_hostname()) + zipf.writestr("df.txt", data=get_disk_free()) + + for filename, data in get_docker_diagnostics(docker_client): + zipf.writestr(filename, data=data) zipf.close() @@ -99,7 +108,7 @@ def get_docker_containers(docker_client): def get_docker_process_status(container): try: - exit_code, output = container.exec_run("ps -auxw", tty=True, privileged=True) + exit_code, output = container.exec_run("ps faux", privileged=True) out_file_name = container.name + "/ps_aux.txt" if exit_code == 0: return (out_file_name, output) @@ -159,5 +168,19 @@ def extract_tar_in_memory(container, data): return logs +def get_brief_diagnostics(): + batch_dir = "/mnt/batch/tasks/startup/" + files = ["stdout.txt", "stderr.txt", "wd/logs/docker.log"] + logs = [] + for file_name in files: + try: + logs.append((file_name, open(batch_dir + file_name, 'rb').read())) + # print("LOG:", (file_name, open(batch_dir+file_name, 'rb').read())) + except FileNotFoundError as e: + print("file not found", e) + logs.append((file_name, bytes(e.__str__(), encoding="utf-8"))) + return logs + + if __name__ == "__main__": main() diff --git a/aztk_cli/spark/endpoints/cluster/cluster_debug.py b/aztk_cli/spark/endpoints/cluster/cluster_debug.py index 3675a0f0..f1a72707 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_debug.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_debug.py @@ -1,7 +1,7 @@ import argparse import os -import typing import time +import typing import aztk.spark from aztk_cli import config, utils @@ -11,6 +11,9 @@ def setup_parser(parser: argparse.ArgumentParser): parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') parser.add_argument('--output', '-o', required=False, help='the directory for the output folder') + parser.add_argument( + '--brief', '-b', required=False, action='store_true', help='Only gets a small subset of key logs') + parser.set_defaults(brief=False) def execute(args: typing.NamedTuple): @@ -20,5 +23,5 @@ def execute(args: typing.NamedTuple): if not args.output: 
        args.output = os.path.join(os.getcwd(), "debug-{0}-{1}".format(args.cluster_id, timestr))
     with utils.Spinner():
-        spark_client.cluster.diagnostics(id=args.cluster_id, output_directory=args.output)
+        spark_client.cluster.diagnostics(id=args.cluster_id, output_directory=args.output, brief=args.brief)
     # TODO: analyze results, display some info about status
diff --git a/docs/10-clusters.md b/docs/10-clusters.md
index 8c1c423e..86ad83a0 100644
--- a/docs/10-clusters.md
+++ b/docs/10-clusters.md
@@ -182,6 +182,15 @@ The debug utility will pull logs from all nodes in the cluster. The utility will
 
 __Please be careful sharing the output of the `debug` command as secrets and application code are present in the output.__
 
+Pass the `--brief` flag to only download the most essential logs from each node:
+```sh
+aztk spark cluster debug --id <cluster_id> --output <path/to/output> --brief
+```
+This command will retrieve:
+- stdout file from the node's startup
+- stderr file from the node's startup
+- the docker log for the spark container
+
 ### Interact with your Spark cluster
 By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to your *localhost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.html#sshyaml).
diff --git a/requirements.txt b/requirements.txt
index 3e88ed9c..89fb5ded 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
 # Distribution
-azure-batch==4.1.3
-azure-mgmt-batch==5.0.0
-azure-mgmt-storage==1.5.0
-azure-storage-blob==1.1.0
-pyyaml==3.12
+azure-batch~=4.1.3
+azure-mgmt-batch~=5.0.0
+azure-mgmt-storage~=2.0.0
+azure-storage-blob~=1.3.1
+pyyaml>=3.12
 pycryptodomex>=3.4
-paramiko==2.4.0
+paramiko>=2.4
 
 # Development
 yapf==0.22.0
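
A note on how the new flag travels end to end, since it is easy to miss in the diff above: `_build_diagnostic_ssh_command` interpolates the Python boolean into the remote command with `str.format`, so the node-side `debug.py` receives the literal text `True` or `False` in `sys.argv[1]` and recovers the boolean by string comparison rather than `bool()`. The sketch below reproduces just that round trip; the variable names are illustrative, and it stands in for `sys.argv` by splitting the command string.

```python
# Minimal sketch of the brief-flag round trip (illustrative names, not part of the patch).
brief = True

# Client side (_build_diagnostic_ssh_command): str.format renders the bool
# as the literal text "True"/"False" at the end of the ssh command.
ssh_cmd = "sudo python3 /tmp/debug.py {}".format(brief)
assert ssh_cmd.endswith("/tmp/debug.py True")

# Node side (debug.py main): the value arrives as a string in sys.argv[1],
# so it is turned back into a boolean by comparing against "True".
argv_1 = ssh_cmd.split()[-1]  # stands in for sys.argv[1]
brief_on_node = argv_1 == "True"
assert brief_on_node is True
```

This works for the two spellings the client can send; any other string (for example a lowercase `true`) would silently fall back to the full diagnostic run.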
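For completeness, the same switch is reachable through the library API that `cluster_debug.py` calls. Below is a minimal, hypothetical sketch: the cluster id and output path are placeholders, and the client is built from `.aztk/secrets.yaml` the way the CLI does it (an assumption here); adapt the construction to however you already create your `aztk.spark.Client`.

```python
import aztk.spark
from aztk_cli import config  # assumption: reuse the CLI's secrets loader for brevity

# Build a Spark client from .aztk/secrets.yaml, mirroring what the CLI does.
spark_client = aztk.spark.Client(config.load_aztk_secrets())

# brief=True takes the lightweight path added by this patch: only the node's
# startup stdout.txt / stderr.txt and the Spark container's docker.log are
# collected instead of the full diagnostic bundle.
output = spark_client.cluster.diagnostics(
    id="my-cluster",                        # hypothetical cluster id
    output_directory="./debug-my-cluster",  # hypothetical local output folder
    brief=True,
)
# Per the docstring in operations.py, `output` is a
# List[aztk.spark.models.NodeOutput], one entry per node.
```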