Skip to content

Commit

Permalink
progress: identify failed simulation runs
Browse files Browse the repository at this point in the history
- marking status of simulation runs as succeeded/failed
- this signals that failed runs should not be simulated again (e.g., after a restart) until the underlying problem has been resolved and those suids have been reset
- define progress status
  • Loading branch information
dschlaep committed Jan 7, 2024
1 parent 5be7b20 commit 217226f
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 18 deletions.
2 changes: 1 addition & 1 deletion include/SW_Domain.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Bool SW_DOM_CheckProgress(int progFileID, int progVarID,
unsigned long ncSuid[], LOG_INFO* LogInfo);
void SW_DOM_CreateProgress(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo);
void SW_DOM_read(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo);
void SW_DOM_SetProgress(const char* domType, int progFileID,
void SW_DOM_SetProgress(Bool success, const char* domType, int progFileID,
int progVarID, unsigned long ncSuid[],
LOG_INFO* LogInfo);
void SW_DOM_SimSet(SW_DOMAIN* SW_Domain, unsigned long userSUID,
Expand Down
2 changes: 1 addition & 1 deletion include/SW_netCDF.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void SW_NC_create_template(const char* domFile, int domFileID,
const char* attNames[], const char* attVals[], int numAtts, Bool isInput,
const char* freq, LOG_INFO* LogInfo);
void SW_NC_create_progress(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo);
void SW_NC_set_progress(const char* domType, int progFileID,
void SW_NC_set_progress(Bool success, const char* domType, int progFileID,
int progVarID, unsigned long ncSUID[],
LOG_INFO* LogInfo);
Bool SW_NC_check_progress(int progFileID, int progVarID,
Expand Down
12 changes: 6 additions & 6 deletions src/SW_Control.c
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,9 @@ void SW_CTL_RunSimSet(SW_ALL *sw_template, SW_OUTPUT_POINTERS SW_OutputPtrs[],
SW_WT_TimeRun(tsr, ok_tsr, SW_WallTime);

/* Report progress for suid */
if(!local_LogInfo.stopRun) {
SW_DOM_SetProgress(SW_Domain->DomainType, progFileID,
progVarID, ncSuid, &local_LogInfo);
}
SW_DOM_SetProgress(!local_LogInfo.stopRun,
SW_Domain->DomainType, progFileID,
progVarID, ncSuid, &local_LogInfo);
}

/* Report errors and warnings for suid */
Expand All @@ -278,11 +277,12 @@ void SW_CTL_RunSimSet(SW_ALL *sw_template, SW_OUTPUT_POINTERS SW_OutputPtrs[],
}

/* Produce global error if all suids failed */
if (nSims == main_LogInfo->numDomainErrors) {
if (nSims > 0 && nSims == main_LogInfo->numDomainErrors) {
LogError(
main_LogInfo,
LOGERROR,
"All simulated units produced errors."
"All simulated units (n = %zu) produced errors.",
nSims
);
}

Expand Down
8 changes: 5 additions & 3 deletions src/SW_Domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,9 @@ void SW_DOM_read(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo) {
}

/**
* @brief Mark a completed suid in progress netCDF
* @brief Mark completion status of simulation run
*
* @param[in] success Did simulation run succeed or fail?
* @param[in] domType Type of domain in which simulations are running
* (gridcell/sites)
* @param[in] progFileID Identifier of the progress netCDF file
Expand All @@ -265,13 +266,14 @@ void SW_DOM_read(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo) {
* in relation to netCDFs
* @param[in,out] LogInfo
*/
void SW_DOM_SetProgress(const char* domType, int progFileID,
void SW_DOM_SetProgress(Bool success, const char* domType, int progFileID,
int progVarID, unsigned long ncSuid[],
LOG_INFO* LogInfo) {

#if defined(SWNETCDF)
SW_NC_set_progress(domType, progFileID, progVarID, ncSuid, LogInfo);
SW_NC_set_progress(success, domType, progFileID, progVarID, ncSuid, LogInfo);
#else
(void) success;
(void) progFileID;
(void) progVarID;
(void) ncSuid;
Expand Down
19 changes: 12 additions & 7 deletions src/SW_netCDF.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
#define NUM_NC_IN_KEYS 2 // Number of possible keys within `files_nc.in`
#define NUM_ATT_IN_KEYS 25 // Number of possible keys within `attributes_nc.in`

/* Status codes stored as signed bytes in the progress netCDF variable.
   fill_prog_netCDF_vals() initializes simulatable SUIDs to PRGRSS_READY,
   SW_NC_set_progress() writes PRGRSS_DONE or PRGRSS_FAIL after a run, and
   SW_NC_check_progress() reports true only for PRGRSS_READY. */
#define PRGRSS_READY ((signed char)0) // SUID is ready for simulation
#define PRGRSS_DONE ((signed char)1) // SUID has successfully been simulated
#define PRGRSS_FAIL ((signed char)-1) // SUID failed to simulate


/* =================================================== */
/* Local Function Definitions */
/* --------------------------------------------------- */
Expand Down Expand Up @@ -646,7 +651,6 @@ static void fill_prog_netCDF_vals(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo) {

int domVarID = SW_Domain->netCDFInfo.ncVarIDs[vNCdom];
int progVarID = SW_Domain->netCDFInfo.ncVarIDs[vNCprog];
signed char readyVal = 0;
unsigned int domStatus;
unsigned long suid, ncSuid[2], nSUIDs = SW_Domain->nSUIDs;
unsigned long nDimY = SW_Domain->nDimY, nDimX = SW_Domain->nDimX;
Expand All @@ -672,7 +676,7 @@ static void fill_prog_netCDF_vals(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo) {
return; // Exit function prematurely due to error
}

vals[suid] = (domStatus == NC_FILL_UINT) ? NC_FILL_BYTE : readyVal;
vals[suid] = (domStatus == NC_FILL_UINT) ? NC_FILL_BYTE : PRGRSS_READY;
}

fill_netCDF_var_byte(progFileID, progVarID, vals, start, count, LogInfo);
Expand Down Expand Up @@ -1988,8 +1992,9 @@ void SW_NC_create_progress(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo) {
}

/**
* @brief Mark a site/gridcell as completed in the progress file
* @brief Mark a site/gridcell as completed (success/fail) in the progress file
*
* @param[in] success Did simulation run succeed or fail?
* @param[in] domType Type of domain in which simulations are running
* (gridcell/sites)
* @param[in] progFileID Identifier of the progress netCDF file
Expand All @@ -1998,11 +2003,11 @@ void SW_NC_create_progress(SW_DOMAIN* SW_Domain, LOG_INFO* LogInfo) {
* to get data from netCDF
* @param[in,out] LogInfo Holds information dealing with logfile output
*/
void SW_NC_set_progress(const char* domType, int progFileID,
void SW_NC_set_progress(Bool success, const char* domType, int progFileID,
int progVarID, unsigned long ncSUID[],
LOG_INFO* LogInfo) {

const signed char mark = 1;
const signed char mark = (success) ? PRGRSS_DONE : PRGRSS_FAIL;
size_t *count = (strcmp(domType, "s") == 0) ? (size_t[1]){1} : (size_t[2]){1, 1};

fill_netCDF_var_byte(progFileID, progVarID, &mark, ncSUID, count, LogInfo);
Expand All @@ -2022,11 +2027,11 @@ void SW_NC_set_progress(const char* domType, int progFileID,
Bool SW_NC_check_progress(int progFileID, int progVarID,
unsigned long ncSUID[], LOG_INFO* LogInfo) {

signed char progVal = 0, readyVal = 0;
signed char progVal = 0;

get_single_byte_val(progFileID, progVarID, ncSUID, &progVal, LogInfo);

return (Bool) (!LogInfo->stopRun && progVal == readyVal);
return (Bool) (!LogInfo->stopRun && progVal == PRGRSS_READY);
}

/**
Expand Down

0 comments on commit 217226f

Please sign in to comment.