Skip to content

Commit

Permalink
Feature 1266 gen ens prod missing ensembles (#1275)
Browse files Browse the repository at this point in the history
  • Loading branch information
georgemccabe authored Nov 16, 2021
1 parent 083b80d commit 4fbb689
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 77 deletions.
31 changes: 30 additions & 1 deletion docs/use_cases/met_tool_wrapper/GenEnsProd/GenEnsProd.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,21 @@
# Scientific Objective
# --------------------
#
# Generate ensemble products.
# Generate ensemble products. This use case demonstrates how to configure
# the gen_ens_prod tool if you expect that there will occasionally be missing
# ensembles. 7 ensemble paths are specified but only 6 of them exist in the
# sample input data set. The wrapper will mark ensembles that are not found
# with the MISSING keyword in the file-list file that is read by the tool.
# Also, one of the ensembles is listed as the control member. The gen_ens_prod
# application will error and exit if the control member is included in the
# ensemble list, but the GenEnsProd wrapper will automatically remove the
# control member from the ensemble list. This makes it easier to configure
# the tool to change the control member without having to change the ensemble
# list. The number of expected members (defined with GEN_ENS_PROD_N_MEMBERS)
# is 6 (7 members - 1 control member). The actual number of ensemble members
# that will be found in this example is 5 (arw-tom-gep4 is not included).
# The ens.ens_thresh value (defined by GEN_ENS_PROD_ENS_THRESH) is set to 0.8.
# The fraction of valid ensemble members is ~0.833 (5 of the 6 expected),
# which meets the 0.8 threshold, so the application will run.

##############################################################################
# Datasets
Expand Down Expand Up @@ -95,6 +109,21 @@
#
# * gen_ens_prod_20100101_120000V_ens.nc
#
# A file-list file will also be generated in stage/file_lists called:
#
# * 20091231120000_24_gen_ens_prod.txt
#
# It should contain a list of 6 files in {INPUT_BASE} with 1 file marked as
# missing because it was not found::
#
# file_list
# {INPUT_BASE}/met_test/data/sample_fcst/2009123112/arw-sch-gep2/d01_2009123112_02400.grib
# {INPUT_BASE}/met_test/data/sample_fcst/2009123112/arw-tom-gep3/d01_2009123112_02400.grib
# MISSING/{INPUT_BASE}/met_test/data/sample_fcst/2009123112/arw-tom-gep4/d01_2009123112_02400.grib
# {INPUT_BASE}/met_test/data/sample_fcst/2009123112/arw-fer-gep5/d01_2009123112_02400.grib
# {INPUT_BASE}/met_test/data/sample_fcst/2009123112/arw-sch-gep6/d01_2009123112_02400.grib
# {INPUT_BASE}/met_test/data/sample_fcst/2009123112/arw-tom-gep7/d01_2009123112_02400.grib
#

##############################################################################
# Keywords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -568,10 +568,10 @@ def test_ensemble_stat_single_field(metplus_config, config_overrides,
config_file = wrapper.c_dict.get('CONFIG_FILE')
out_dir = wrapper.c_dict.get('OUTPUT_DIR')
expected_cmds = [(f"{app_path} {verbosity} "
f"{file_list_dir}/20050807000000_12_ensemble.txt "
f"{file_list_dir}/20050807000000_12_ensemble_stat.txt "
f"{config_file} -outdir {out_dir}/2005080712"),
(f"{app_path} {verbosity} "
f"{file_list_dir}/20050807120000_12_ensemble.txt "
f"{file_list_dir}/20050807120000_12_ensemble_stat.txt "
f"{config_file} -outdir {out_dir}/2005080800"),
]

Expand Down
6 changes: 5 additions & 1 deletion metplus/util/met_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2282,7 +2282,11 @@ def format_var_items(field_configs, time_info=None):
return var_items

def find_var_name_indices(config, data_types, met_tool=None):
data_type_regex = f"{'|'.join(data_types)}|BOTH"
data_type_regex = f"{'|'.join(data_types)}"

# if data_types includes FCST or OBS, also search for BOTH
if any([item for item in ['FCST', 'OBS'] if item in data_types]):
data_type_regex += '|BOTH'

regex_string = f"({data_type_regex})"

Expand Down
104 changes: 100 additions & 4 deletions metplus/wrappers/command_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ..util import do_string_sub, ti_calculate, get_seconds_from_string
from ..util import config_metplus
from ..util import METConfigInfo as met_config
from ..util import MISSING_DATA_VALUE

# pylint:disable=pointless-string-statement
'''!@namespace CommandBuilder
Expand Down Expand Up @@ -659,6 +660,8 @@ def find_exact_file(self, level, data_type, time_info, mandatory=True,
# then add it back after the string sub call
saved_level = time_info.pop('level', None)

input_must_exist = self.c_dict.get('INPUT_MUST_EXIST', True)

for template in template_list:
# perform string substitution
filename = do_string_sub(template,
Expand All @@ -671,9 +674,9 @@ def find_exact_file(self, level, data_type, time_info, mandatory=True,
if os.path.sep not in full_path:
self.logger.debug(f"{full_path} is not a file path. "
"Returning that string.")
if return_list:
full_path = [full_path]
return full_path
check_file_list.append(full_path)
input_must_exist = False
continue

self.logger.debug(f"Looking for {data_type}INPUT file {full_path}")

Expand Down Expand Up @@ -719,7 +722,7 @@ def find_exact_file(self, level, data_type, time_info, mandatory=True,

for file_path in check_file_list:
# if file doesn't need to exist, skip check
if not self.c_dict.get('INPUT_MUST_EXIST', True):
if not input_must_exist:
found_file_list.append(file_path)
continue

Expand All @@ -736,6 +739,9 @@ def find_exact_file(self, level, data_type, time_info, mandatory=True,
f"using template {template}")
if not mandatory or not self.c_dict.get('MANDATORY', True):
self.logger.warning(msg)
if self.c_dict.get(f'{data_type}FILL_MISSING'):
found_file_list.append(f'MISSING{file_path}')
continue
else:
self.log_error(msg)

Expand Down Expand Up @@ -843,6 +849,96 @@ def find_file_in_window(self, level, data_type, time_info, mandatory=True,

return out

def find_input_files_ensemble(self, time_info):
    """! Locate the ensemble member files (and the control file, if one
     was requested), then write the ensemble members to a file-list
     file that is added to the list of input files for the command.
     If the control file also appears in the ensemble list, it is
     removed from the list and a warning is logged. The number of files
     found is compared against the expected number of members via
     _check_expected_ensembles().

    @param time_info dictionary containing timing information
    @returns True on success. False if no ensemble files were found,
     a requested control file was not found, the expected member check
     failed, or the file-list file could not be written
    """
    ens_files = self.find_model(time_info, return_list=True)
    if not ens_files:
        self.log_error("Could not find any input files")
        return False

    # control member is optional - only look for it if a template is set
    if self.c_dict.get('CTRL_INPUT_TEMPLATE'):
        control_path = self.find_data(time_info, data_type='CTRL')

        # a control file was requested, so failing to find it is fatal
        if not control_path:
            return False

        self.args.append(f'-ctrl {control_path}')

        # the control member must not also be listed as an ensemble member
        if control_path in ens_files:
            self.logger.warning(f"Control file found in ensemble list: "
                                f"{control_path}. Removing from list.")
            ens_files.remove(control_path)

    # make sure the number of members agrees with the expected count
    members_ok = self._check_expected_ensembles(ens_files)
    if not members_ok:
        return False

    # name of the file-list file is built from init time, lead, and app
    init_time = time_info['init_fmt']
    lead_hours = time_info['lead_hours']
    list_path = self.write_list_file(
        f"{init_time}_{lead_hours}_{self.app_name}.txt",
        ens_files,
    )
    if list_path:
        self.infiles.append(list_path)
        return True

    self.log_error("Could not write filelist file")
    return False

def _check_expected_ensembles(self, input_files):
    """! Helper function for find_input_files_ensemble().
     If number of expected ensemble members was defined in the config,
     then ensure that the number of files found corresponds to the
     expected number. If more files were found, error and return False.
     If fewer files were found, fill in input_files list with MISSING to
     allow valid threshold check inside MET tool to work properly.

    @param input_files list of ensemble file paths that were found. This
     list is modified in place: 'MISSING' entries are appended when fewer
     files were found than expected
    @returns False if more files were found than expected, True otherwise
     (including when the expected member count is not set)
    """
    # use .get with the "unset" value as default so that a wrapper which
    # never defines N_MEMBERS skips the check instead of raising KeyError
    num_expected = self.c_dict.get('N_MEMBERS', MISSING_DATA_VALUE)

    # if expected members count is unset, skip check
    if num_expected == MISSING_DATA_VALUE:
        return True

    num_found = len(input_files)

    # error and return if more than expected number was found
    if num_found > num_expected:
        self.log_error(
            "Found more files than expected! "
            f"Found {num_found} expected {num_expected}. "
            "Adjust wildcard expression in template or adjust "
            "number of expected members (N_MEMBERS). "
            f"Files found: {input_files}"
        )
        return False

    # if fewer files found than expected, warn and add fake files
    if num_found < num_expected:
        self.logger.warning(
            "Found fewer files than expected. "
            f"Found {num_found} expected {num_expected}"
        )
        # pad the list with placeholder entries so the total number of
        # entries equals the expected member count for ens_thresh checking
        diff = num_expected - num_found
        self.logger.warning(f'Adding {diff} fake files to '
                            'ensure ens_thresh check is accurate')
        input_files.extend(['MISSING'] * diff)

    return True

def write_list_file(self, filename, file_list, output_dir=None):
"""! Writes a file containing a list of filenames to the staging dir
Expand Down
28 changes: 15 additions & 13 deletions metplus/wrappers/ensemble_stat_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,18 @@ def create_c_dict(self):
elif c_dict['OBS_GRID_INPUT_DATATYPE'] in util.PYTHON_EMBEDDING_TYPES:
c_dict['OBS_INPUT_DATATYPE'] = c_dict['OBS_GRID_INPUT_DATATYPE']

c_dict['N_MEMBERS'] = \
self.config.getint('config', 'ENSEMBLE_STAT_N_MEMBERS', -1)
c_dict['N_MEMBERS'] = (
self.config.getint('config', 'ENSEMBLE_STAT_N_MEMBERS')
)

# allow multiple files in CommandBuilder.find_data logic
c_dict['ALLOW_MULTIPLE_FILES'] = True

# not all input files are mandatory to be found
c_dict['MANDATORY'] = False

if c_dict['N_MEMBERS'] < 0:
self.log_error("Must set ENSEMBLE_STAT_N_MEMBERS to a integer > 0")
# fill inputs that are not found with fake path to note it is missing
c_dict['FCST_FILL_MISSING'] = True

c_dict['OBS_POINT_INPUT_DIR'] = \
self.config.getdir('OBS_ENSEMBLE_STAT_POINT_INPUT_DIR', '')
Expand All @@ -177,11 +184,9 @@ def create_c_dict(self):
c_dict['FCST_INPUT_DIR'] = \
self.config.getdir('FCST_ENSEMBLE_STAT_INPUT_DIR', '')

# This is a raw string and will be interpreted to generate the
# ensemble member filenames. This may be a list of 1 or n members.
c_dict['FCST_INPUT_TEMPLATE'] = \
util.getlist(self.config.getraw('filename_templates',
'FCST_ENSEMBLE_STAT_INPUT_TEMPLATE'))
c_dict['FCST_INPUT_TEMPLATE'] = (
self.config.getraw('config', 'FCST_ENSEMBLE_STAT_INPUT_TEMPLATE')
)
if not c_dict['FCST_INPUT_TEMPLATE']:
self.log_error("Must set FCST_ENSEMBLE_STAT_INPUT_TEMPLATE")

Expand Down Expand Up @@ -364,12 +369,9 @@ def run_at_time_all_fields(self, time_info):
@param time_info dictionary containing timing information
"""
# get ensemble model files
fcst_file_list = self.find_model_members(time_info)
if not fcst_file_list:
if not self.find_input_files_ensemble(time_info):
return

self.infiles.append(fcst_file_list)

# parse var list for ENS fields
ensemble_var_list = util.sub_var_list(self.c_dict['ENS_VAR_LIST_TEMP'],
time_info)
Expand Down
77 changes: 26 additions & 51 deletions metplus/wrappers/gen_ens_prod_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from ..util import do_string_sub, ti_calculate, get_lead_sequence
from ..util import skip_time, parse_var_list, sub_var_list

from . import LoopTimesWrapper

class GenEnsProdWrapper(LoopTimesWrapper):
Expand All @@ -21,8 +22,8 @@ class GenEnsProdWrapper(LoopTimesWrapper):
'METPLUS_CAT_THRESH',
'METPLUS_NC_VAR_STR',
'METPLUS_ENS_FILE_TYPE',
'METPLUS_ENS_ENS_THRESH',
'METPLUS_ENS_VLD_THRESH',
'METPLUS_ENS_THRESH',
'METPLUS_VLD_THRESH',
'METPLUS_ENS_FIELD',
'METPLUS_NBRHD_PROB_DICT',
'METPLUS_NMEP_SMOOTH_DICT',
Expand Down Expand Up @@ -65,15 +66,27 @@ def create_c_dict(self):
)

# get input template/dir - template is required
c_dict['INPUT_TEMPLATE'] = self.config.getraw(
c_dict['FCST_INPUT_TEMPLATE'] = self.config.getraw(
'config',
'GEN_ENS_PROD_INPUT_TEMPLATE'
)
c_dict['INPUT_DIR'] = self.config.getdir('GEN_ENS_PROD_INPUT_DIR', '')
c_dict['FCST_INPUT_DIR'] = self.config.getdir('GEN_ENS_PROD_INPUT_DIR',
'')

if not c_dict['INPUT_TEMPLATE']:
if not c_dict['FCST_INPUT_TEMPLATE']:
self.log_error('GEN_ENS_PROD_INPUT_TEMPLATE must be set')

# not all input files are mandatory to be found
c_dict['MANDATORY'] = False

# fill inputs that are not found with fake path to note it is missing
c_dict['FCST_FILL_MISSING'] = True

# number of expected ensemble members
c_dict['N_MEMBERS'] = (
self.config.getint('config', 'GEN_ENS_PROD_N_MEMBERS')
)

# get ctrl (control) template/dir - optional
c_dict['CTRL_INPUT_TEMPLATE'] = self.config.getraw(
'config',
Expand Down Expand Up @@ -195,68 +208,30 @@ def run_at_time_once(self, time_info):
@param time_info dictionary containing timing information
"""
# add config file to arguments
config_file = do_string_sub(self.c_dict['CONFIG_FILE'], **time_info)
self.args.append(f"-config {config_file}")

if not self.find_field_info(time_info):
return False

if not self.find_input_files(time_info):
if not self.find_input_files_ensemble(time_info):
return False

if not self.find_and_check_output_file(time_info):
return False

# add config file to arguments
config_file = do_string_sub(self.c_dict['CONFIG_FILE'], **time_info)
self.args.append(f"-config {config_file}")

if not self.find_ctrl_file(time_info):
return False

# set environment variables that are passed to the MET config
self.set_environment_variables(time_info)

return self.build()

def find_input_files(self, time_info):
    """! Find all input files and write them to a file-list file that is
     added to the list of input files for the command.

    @param time_info dictionary containing timing information
    @returns True on success. False if no input files were found or the
     file-list file could not be written
    """
    found_files = self.find_data(time_info, return_list=True)
    if not found_files:
        self.log_error("Could not find any input files")
        return False

    # file-list file name is built from the init time and forecast lead
    init_time = time_info['init_fmt']
    lead_hours = time_info['lead_hours']
    list_path = self.write_list_file(
        f"{init_time}_{lead_hours}_gen_ens_prod.txt",
        found_files,
    )
    if list_path:
        self.infiles.append(list_path)
        return True

    self.log_error("Could not write filelist file")
    return False

def find_ctrl_file(self, time_info):
"""! Find optional ctrl (control) file if requested
def find_field_info(self, time_info):
"""! parse var list for ENS fields
@param time_info dictionary containing timing information
@returns True on success or if ctrl not requested
@returns True if successful, False if something went wrong
"""
if not self.c_dict['CTRL_INPUT_TEMPLATE']:
return True

input_file = self.find_data(time_info, data_type='CTRL')
if not input_file:
return False

self.args.append(f'-ctrl {input_file}')
return True

def find_field_info(self, time_info):
# parse var list for ENS fields
ensemble_var_list = sub_var_list(self.c_dict['ENS_VAR_LIST_TEMP'],
time_info)
all_fields = []
Expand Down
Loading

0 comments on commit 4fbb689

Please sign in to comment.