Skip to content

Commit

Permalink
Add GNU and Cheyenne Support to Automated RT (NOAA-EMC#444)
Browse files Browse the repository at this point in the history
* Add capability for GNU and Add support for Cheyenne

Co-authored-by: climbfuji <[email protected]>
  • Loading branch information
BrianCurtis-NOAA and climbfuji authored Mar 3, 2021
1 parent 4fbebd7 commit 0b8a889
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 65 deletions.
119 changes: 68 additions & 51 deletions tests/auto/rt_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def parse_args_in():
parser = argparse.ArgumentParser()

# Setup Input Arguments
choices = ['hera.intel', 'orion.intel', 'gaea.intel', 'jet.intel', 'wcoss_dell_p3']
parser.add_argument('-m', '--machine', help='Machine and Compiler combination', required=True, choices=choices, type=str)
choices = ['cheyenne', 'hera', 'orion', 'gaea', 'jet', 'wcoss_dell_p3']
parser.add_argument('-m', '--machine', help='Machine name', required=True, choices=choices, type=str)
parser.add_argument('-w', '--workdir', help='Working directory', required=True, type=str)

# Get Arguments
Expand Down Expand Up @@ -73,15 +73,19 @@ def input_data(args):

def match_label_with_action(machine, actions, label):
''' Match the label that initiates a job with an action in the dict'''
# <machine>-<compiler>-<test> i.e. hera-gnu-RT
# RT = full regression test suite
logger = logging.getLogger('MATCH_LABEL_WITH_ACTIONS')
split_label = label.name.split('-')

if len(split_label) != 3: return False
if not re.match(split_label[0], 'Auto'): return False
if not re.match(split_label[2], machine['name'].split('.')[0]): return False
action_match = next((action for action in actions if re.match(action['name'], split_label[1])), False)

return action_match
if len(split_label) != 3: return False, False #Make sure it has three parts
if not re.match(split_label[0], machine['name']): return False, False #First check machine name matches
compiler = split_label[1]
if not str(compiler) in ["intel", "gnu"]: return False, False
action_match = next((action for action in actions if re.match(action['name'], split_label[2])), False)
action_match["command"] = f'export RT_COMPILER="{compiler}" && {action_match["command"]}'
if split_label[2] == "RT" and compiler == "gnu":
action_match["command"] = f'{action_match["command"]} -l rt_gnu.conf'
return compiler, action_match


def get_preqs_with_actions(repos, machine, ghinterface_obj, actions):
Expand All @@ -92,9 +96,10 @@ def get_preqs_with_actions(repos, machine, ghinterface_obj, actions):
preq_labels = [{'preq': pr, 'label': label} for pr in each_pr for label in pr.get_labels()]

for i, pr_label in enumerate(preq_labels):
match = match_label_with_action(machine, actions, pr_label['label'])
compiler, match = match_label_with_action(machine, actions, pr_label['label'])
if match:
preq_labels[i]['action'] = match
preq_labels[i]['compiler'] = compiler
else:
preq_labels[i] = False

Expand Down Expand Up @@ -130,8 +135,20 @@ def remove_pr_label(self):
self.logger.info(f'Removing Label: {self.preq_dict["label"]}')
self.preq_dict['preq'].remove_from_labels(self.preq_dict['label'])

def send_log_name_as_comment(self):
def check_label_before_job_start(self):
    ''' Check the label still exists before the start of the job

    With multiple jobs queued, the label that requested this one may have
    been removed in the meantime. The expected label text is rebuilt as
    <machine name>-<compiler>-<action name> and compared literally with the
    names of the labels currently on the pull request.

    Returns the matching label object, or False when it is no longer there.
    '''
    label_to_check = (f'{self.machine["name"]}-{self.preq_dict["compiler"]}'
                      f'-{self.preq_dict["action"]["name"]}')
    labels = self.preq_dict['preq'].get_labels()
    # Exact comparison: label names are arbitrary text, not regex patterns.
    return next((label for label in labels if label.name == label_to_check), False)


def send_log_name_as_comment(self, log_filename):
logger = logging.getLogger('JOB/SEND_LOG_NAME_AS_COMMENT')

#Remove LAST MONTHS LOGS
logger.info('Removing last months logs (if any)')
last_month = datetime.date.today().replace(day=1) - datetime.timedelta(days=1)
rm_command = [[f'rm rt_auto_*_{last_month.strftime("%Y%m")}*.log', os.getcwd()]]
Expand All @@ -141,24 +158,16 @@ def send_log_name_as_comment(self):
except Exception as e:
logger.warning(f'"{rm_command}" failed with error:{e}')

new_log_name = f'rt_auto_{self.machine["name"]}_'\
f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}.log'
cp_command = [[f'cp rt_auto.log {new_log_name}', os.getcwd()]]
logger.info(f'Running "{cp_command}"')
# Add log information to PR.
comment_text = f'Log Name:{log_filename}\n'\
f'Log Location:{os.getcwd()}\n'\
'Logs are kept for one month'
try:
self.run_commands(cp_command)
self.preq_dict['preq'].create_issue_comment(comment_text)
except Exception as e:
logger.warning('Renaming rt_auto failed')
logger.warning('Creating comment with log location failed with:{e}')
else:
comment_text = f'Log Name:{new_log_name}\n'\
f'Log Location:{os.getcwd()}\n'\
'Logs are kept for one month'
try:
self.preq_dict['preq'].create_issue_comment(comment_text)
except Exception as e:
logger.warning('Creating comment with log location failed with:{e}')
else:
logger.info(f'{comment_text}')
logger.info(f'{comment_text}')

def run_commands(self, commands_with_cwd):
logger = logging.getLogger('JOB/RUN_COMMANDS')
Expand Down Expand Up @@ -212,9 +221,12 @@ def clone_pr_repo(self):
def run_function(self):
''' Run the command associted with the label used to initiate this job '''
logger = logging.getLogger('JOB/RUN_FUNCTION')
compiler = self.preq_dict['compiler']
logger.info(f'Compiler being used for command is {compiler}')
command = self.preq_dict["action"]["command"]
try:
logger.info(f'Running: "{self.preq_dict["action"]["command"]}" in "{self.pr_repo_loc}"')
output = subprocess.Popen(self.preq_dict['action']['command'], cwd=self.pr_repo_loc, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
logger.info(f'Running: "{command}" in "{self.pr_repo_loc}"')
output = subprocess.Popen(command, cwd=self.pr_repo_loc, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
out,err = output.communicate()
out = [] if not out else out.decode('utf8').split('\n')
err = [] if not err else err.decode('utf8').split('\n')
Expand All @@ -225,7 +237,13 @@ def run_function(self):
assert(e)
else:
if output.returncode != 0:
logger.critical(f'{self.preq_dict["action"]["command"]} Failed')
comment_text = f'rt.sh failed \n'\
f'machine: {self.machine["name"]} \n'\
f'compiler: {self.preq_dict["compiler"]}\n'\
f'STDOUT: {out} \n'\
f'STDERR: {err}'
self.preq_dict['preq'].create_issue_comment(comment_text)
logger.critical(f'{command} Failed')
[logger.critical(f'stdout: {item}') for item in out if not None]
[logger.critical(f'stderr: {eitem}') for eitem in err if not None]
else:
Expand All @@ -245,28 +263,27 @@ def run_function(self):
def move_rt_logs(self):
''' This is the callback function associated with the "RT" command '''
logger = logging.getLogger('JOB/MOVE_RT_LOGS')
rt_log = f'tests/RegressionTests_{self.machine["name"]}.log'
rt_log = f'tests/RegressionTests_{self.machine["name"]}.{self.preq_dict["compiler"]}.log'
filepath = f'{self.pr_repo_loc}/{rt_log}'
rm_filepath = '/'.join((self.pr_repo_loc.split('/'))[:-1])
if os.path.exists(filepath):
move_rt_commands = [
[f'git pull --ff-only origin {self.branch}', self.pr_repo_loc],
[f'git add {rt_log}', self.pr_repo_loc],
[f'git commit -m "Auto: Added Updated RT Log file: {rt_log}"', self.pr_repo_loc],
[f'git pull --no-edit origin {self.branch}', self.pr_repo_loc],
[f'git commit -m "Auto: Add RT Log file: {rt_log} skip-ci"', self.pr_repo_loc],
['sleep 10', self.pr_repo_loc],
[f'git push origin {self.branch}', self.pr_repo_loc]
]
self.run_commands(move_rt_commands)

else:
logger.critical('Could not find RT log')
raise FileNotFoundError('Could not find RT log')
logger.critical('Could not find Intel RT log')
raise FileNotFoundError('Could not find Intel RT log')

def main():

# handle logging
log_path = os.getcwd()
log_filename = 'rt_auto.log'
log_filename = f'rt_auto_{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}.log'
# Please don't run the following on cron with level=logging.DEBUG
# as it exposes the GH API Token
# Only set it to DEBUG while debugging
Expand All @@ -288,26 +305,26 @@ def main():
# get all pull requests from the GitHub object
logger.info('Getting all pull requests, labels and actions applicable to this machine.')
preq_dict = get_preqs_with_actions(repos, machine, ghinterface_obj, actions)

# add Job objects and run them
logger.info('Adding all jobs to an object list and running them.')
jobs = [Job(pullreq, ghinterface_obj, machine) for pullreq in preq_dict]
for job in jobs:
logger.info(f'Starting Job: {job}')
try:
logger.info('Calling remove_pr_label')
job.remove_pr_label()
logger.info('Calling clone_pr_repo')
job.clone_pr_repo()
logger.info('Calling run_function')
job.run_function()
logger.info('Calling remove_pr_dir')
job.remove_pr_dir()
logger.info('Calling send_log_name_as_comment')
job.send_log_name_as_comment()
except Exception as e:
logger.critical(e)
assert(e)
if job.check_label_before_job_start():
try:
logger.info('Calling remove_pr_label')
job.remove_pr_label()
logger.info('Calling clone_pr_repo')
job.clone_pr_repo()
logger.info('Calling run_function')
job.run_function()
logger.info('Calling remove_pr_dir')
# job.remove_pr_dir()
# logger.info('Calling send_log_name_as_comment')
job.send_log_name_as_comment(log_filename)
except Exception as e:
logger.critical(e)
assert(e)

logger.info('Script Finished')

Expand Down
36 changes: 22 additions & 14 deletions tests/auto/rt_auto.sh
Original file line number Diff line number Diff line change
@@ -1,45 +1,53 @@
#!/bin/bash --login
# Driver for the automated regression tests: loads the GitHub API token,
# selects per-machine settings from the login host name, then runs rt_auto.py.
#
# Command tracing (set -x) is switched on only after accesstoken.sh has been
# sourced; tracing the source would print its `export ghapitoken=...` line.
set -eu

if [ -f "accesstoken.sh" ]; then
  # Refuse to run unless the token file has mode 600.
  if [ "$(stat -L -c "%a" "accesstoken.sh")" = "600" ]; then
    echo "Sourcing accesstoken.sh"
    source ./accesstoken.sh
  else
    echo "accesstoken.sh permissions NEED to be set to 600 before starting"
    exit 1
  fi
else
  echo "Please create accesstoken.sh (600) with the following content"
  echo
  echo "export ghapitoken=<GitHub API Token Here>"
  exit 1
fi

set -x

# Pick the machine name, working directory and Python environment from the
# name of the host this script is started on.
if [[ $HOSTNAME == hfe* ]]; then
  MACHINE_NAME=hera
  WORKDIR=/scratch1/NCEPDEV/nems/Brian.Curtis/test
  export PATH=/scratch1/NCEPDEV/nems/emc.nemspara/soft/miniconda3/bin:$PATH
  export PYTHONPATH=/scratch1/NCEPDEV/nems/emc.nemspara/soft/miniconda3/lib/python3.8/site-packages
elif [[ $HOSTNAME == Orion-login-* ]]; then
  MACHINE_NAME=orion
  WORKDIR=/work/noaa/nems/bcurtis/test
  export PATH=/work/noaa/nems/emc.nemspara/soft/miniconda3/bin:$PATH
  export PYTHONPATH=/work/noaa/nems/emc.nemspara/soft/miniconda3/lib/python3.8/site-packages
elif [[ $HOSTNAME == fe* ]]; then
  MACHINE_NAME=jet
  WORKDIR=/lfs4/HFIP/h-nems/Brian.Curtis/test
  export ACCNR="h-nems"
  export PATH=/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/envs/ufs-weather-model/bin:/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/bin:$PATH
  export PYTHONPATH=/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/envs/ufs-weather-model/lib/python3.8/site-packages:/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/lib/python3.8/site-packages
elif [[ $HOSTNAME == gaea* ]]; then
  MACHINE_NAME=gaea
  WORKDIR=/lustre/f2/pdata/ncep/Brian.Curtis/test
  export LOADEDMODULES=$LOADEDMODULES
  export ACCNR="nggps_emc" # This applies to Brian.Curtis, may need change later
  export PATH=/lustre/f2/pdata/esrl/gsd/contrib/miniconda3/4.8.3/envs/ufs-weather-model/bin:$PATH
  export PYTHONPATH=/lustre/f2/pdata/esrl/gsd/contrib/miniconda3/4.8.3/lib/python3.8/site-packages
elif [[ $HOSTNAME == *.cheyenne.ucar.edu ]]; then
  MACHINE_NAME=cheyenne
  WORKDIR=/glade/work/heinzell/fv3/ufs-weather-model/auto-rt
  export ACCNR="P48503002"
  export PATH=/glade/p/ral/jntp/tools/miniconda3/4.8.3/envs/ufs-weather-model/bin:/glade/p/ral/jntp/tools/miniconda3/4.8.3/bin:$PATH
  export PYTHONPATH=/glade/p/ral/jntp/tools/miniconda3/4.8.3/envs/ufs-weather-model/lib/python3.8/site-packages:/glade/p/ral/jntp/tools/miniconda3/4.8.3/lib/python3.8/site-packages
else
  echo "No Python Path for this machine. automated RT not starting"
  exit 1
fi

python rt_auto.py -m "$MACHINE_NAME" -w "$WORKDIR"

exit 0
6 changes: 6 additions & 0 deletions tests/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ case $(hostname -f) in
cheyenne4.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne4
cheyenne5.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne5
cheyenne6.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne6
chadmin1.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### chadmin1
chadmin2.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### chadmin2
chadmin3.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### chadmin3
chadmin4.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### chadmin4
chadmin5.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### chadmin5
chadmin6.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### chadmin6

login1.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede1
login2.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede2
Expand Down

0 comments on commit 0b8a889

Please sign in to comment.