Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
jtwhite79 committed Dec 2, 2024
2 parents 932b217 + b20768b commit 5e36ab8
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 23 deletions.
Binary file not shown.
6 changes: 3 additions & 3 deletions documentation/pestpp_users_manual.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@

<img src="./media/image1.png" style="width:6.26806in;height:1.68194in" alt="A close up of a purple sign Description automatically generated" />

# <a id='s1' />Version 5.2.15
# <a id='s1' />Version 5.2.16

<img src="./media/image2.png" style="width:6.26806in;height:3.05972in" />

PEST++ Development Team

November 2024
December 2024

# <a id='s2' />Acknowledgements

Expand Down Expand Up @@ -70,7 +70,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI

# Table of Contents

- [Version 5.2.15](#s1)
- [Version 5.2.16](#s1)
- [Acknowledgements](#s2)
- [Preface](#s3)
- [License](#s4)
Expand Down
2 changes: 1 addition & 1 deletion src/libs/common/config_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CONFIG_OS_H_


#define PESTPP_VERSION "5.2.15";
#define PESTPP_VERSION "5.2.16";

#if defined(_WIN32) || defined(_WIN64)
#define OS_WIN
Expand Down
2 changes: 1 addition & 1 deletion src/libs/pestpp_common/EnsembleMethodUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7750,7 +7750,7 @@ void EnsembleMethod::reset_par_ensemble_to_prior_mean(){
ss << "iteration:" << iter;
vector<int> temp;
ofstream& frec = file_manager.rec_ofstream();
oe = oe_base;
oe.reserve(oe_base.get_real_names(),oe.get_var_names());
weights = weights_base;
run_ensemble_util(performance_log,frec,new_pe,oe,run_mgr_ptr,false,temp,NetPackage::NULL_DA_CYCLE, ss.str());
pe = new_pe;
Expand Down
4 changes: 2 additions & 2 deletions src/libs/run_managers/yamr/PantherAgent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ void PANTHERAgent::process_ctl_file(const string &ctl_filename)
mi.set_fill_tpl_zeros(pest_scenario.get_pestpp_options().get_fill_tpl_zeros());
mi.set_tpl_force_decimal(pest_scenario.get_pestpp_options().get_tpl_force_decimal());
mi.set_num_threads(pest_scenario.get_pestpp_options().get_num_tpl_ins_threads());
mi.set_sleep_ms(100);
mi.set_sleep_ms(5);
restart_on_error = pest_scenario.get_pestpp_options().get_panther_agent_restart_on_error();
max_time_without_master_ping_seconds = pest_scenario.get_pestpp_options().get_panther_agent_no_ping_timeout_secs();
FileManager fm("panther_agent");
Expand Down Expand Up @@ -538,7 +538,7 @@ std::pair<NetPackage::PackType,std::string> PANTHERAgent::run_model(Parameters &
// Calls mi.set_sleep_ms(...) and then forwards every argument unchanged to
// mi.run(): the terminate/finished thread flags, the exception_ptr reference,
// and the Parameters/Observations pointers.
void PANTHERAgent::run_async(pest_utils::thread_flag* terminate, pest_utils::thread_flag* finished, exception_ptr& run_exception,
Parameters* pars, Observations* obs)
{
// NOTE(review): two set_sleep_ms calls appear back to back here (100, then 5).
// This looks like the before/after pair of a rendered diff rather than
// intentional code; presumably only the final value (5) is meant to remain -
// confirm against the actual source file.
mi.set_sleep_ms(100);
mi.set_sleep_ms(5);
mi.run(terminate,finished,run_exception, pars, obs);
}

Expand Down
41 changes: 26 additions & 15 deletions src/libs/run_managers/yamr/RunManagerPanther.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ const int RunManagerPanther::N_PINGS_UNRESPONSIVE = 3;
const int RunManagerPanther::MIN_PING_INTERVAL_SECS = 60; // Ping each slave at most once every minute
const int RunManagerPanther::MAX_PING_INTERVAL_SECS = 120; // Ping each slave at least once every 2 minutes
const int RunManagerPanther::MAX_CONCURRENT_RUNS_LOWER_LIMIT = 1;
const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)

const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
const double RunManagerPanther::MIN_AVGRUNMINS_FOR_KILL = 0.08; //minimum avg runtime to try to kill and/or resched runs
const int RunManagerPanther::SECONDS_BETWEEN_ECHOS = 1;

AgentInfoRec::AgentInfoRec(int _socket_fd)
{
Expand Down Expand Up @@ -520,6 +521,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
}

std::chrono::system_clock::time_point start_time = std::chrono::system_clock::now();
last_echo_time = std::chrono::system_clock::now();
double run_time_sec = 0.0;
while (!all_runs_complete() && terminate_reason == RUN_UNTIL_COND::NORMAL)
{
Expand Down Expand Up @@ -560,7 +562,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
}

}
w_sleep(100);
w_sleep(10);
n_no_ops = 0;
while (true)
{
Expand Down Expand Up @@ -726,7 +728,7 @@ void RunManagerPanther::run_idle_async()
idling.set(false);

// Sleep briefly to avoid spinlock
w_sleep(100);
w_sleep(10);
continue;
}

Expand Down Expand Up @@ -816,7 +818,7 @@ void RunManagerPanther::end_run_idle_async()
}

// Sleep to avoid spinlock
w_sleep(50);
w_sleep(10);
}

report("Stopped idle ping thread, as Panther manager is shutting down.", false);
Expand Down Expand Up @@ -857,7 +859,7 @@ void RunManagerPanther::pause_idle()
}

// Sleep to avoid spinlock
w_sleep(50);
w_sleep(10);
}

report("Panther idle ping thread paused prior to scheduling runs.", false);
Expand Down Expand Up @@ -947,7 +949,7 @@ bool RunManagerPanther::listen(pest_utils::thread_flag* terminate/* = nullptr*/)
fd_set read_fds; // temp file descriptor list for select()
socklen_t addr_len;
timeval tv;
tv.tv_sec = 1;
tv.tv_sec = 0;
tv.tv_usec = 0;
read_fds = master; // copy it
if (w_select(fdmax+1, &read_fds, NULL, NULL, &tv) == -1)
Expand Down Expand Up @@ -1006,7 +1008,7 @@ void RunManagerPanther::close_agents()
sock_nums.push_back(si.first);
for (auto si : sock_nums)
close_agent(si);
w_sleep(100);
w_sleep(10);

}
}
Expand Down Expand Up @@ -1107,7 +1109,7 @@ void RunManagerPanther::schedule_runs()
duration = it_agent->get_duration_minute();
avg_runtime = it_agent->get_runtime_minute();
if (avg_runtime <= 0) avg_runtime = global_avg_runtime;
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
if (avg_runtime <= 0) avg_runtime = 1.0E+300;
vector<int> overdue_kill_runs_vec = get_overdue_runs_over_kill_threshold(run_id);

if (failure_map.count(run_id) + overdue_kill_runs_vec.size() >= max_n_failure)
Expand All @@ -1131,7 +1133,9 @@ void RunManagerPanther::schedule_runs()
should_schedule = true;
model_runs_timed_out += overdue_kill_runs_vec.size();
}
else if (((duration > overdue_giveup_minutes) || (duration > avg_runtime*overdue_giveup_fac))

else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) &&
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
&& free_agent_list.empty())
{
// If there are no free slaves kill the overdue ones
Expand All @@ -1147,7 +1151,8 @@ void RunManagerPanther::schedule_runs()
}
model_runs_timed_out += 1;
}
else if (duration > avg_runtime*overdue_reched_fac)

else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > MIN_AVGRUNMINS_FOR_KILL))
{
//check how many concurrent runs are going
if (n_concur < max_concurrent_runs) should_schedule = true;
Expand Down Expand Up @@ -1285,6 +1290,10 @@ void RunManagerPanther::echo()
{
if (!should_echo)
return;
std::chrono::system_clock::time_point now = chrono::system_clock::now();
if (chrono::duration_cast<std::chrono::seconds> ( now- last_echo_time).count() < SECONDS_BETWEEN_ECHOS)
return;
last_echo_time = now;
map<string, int> stats_map = get_agent_stats();
cout << get_time_string_short() << " mn:" << setw(5) << setprecision(2) << left << get_global_runtime_minute() << " runs("
<< "C" << setw(5) << left << model_runs_done
Expand Down Expand Up @@ -1939,7 +1948,9 @@ void RunManagerPanther::kill_all_active_runs()
if (avg_runtime <= 0) avg_runtime = get_global_runtime_minute();;
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
duration = i->second->get_duration_minute();
if ((just_quit) || (duration > overdue_giveup_minutes) || (duration >= avg_runtime*overdue_giveup_fac))
if ((just_quit) || (duration > overdue_giveup_minutes) ||
((duration >= avg_runtime*overdue_giveup_fac) &&
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
{
sock_id_vec.push_back(i->second->get_socket_fd());
}
Expand Down Expand Up @@ -2132,7 +2143,7 @@ RunManagerPanther::~RunManagerPanther(void)
err = w_close(listener);
FD_CLR(listener, &master);
// this is needed to ensure that the first slave closes properly
w_sleep(500);
w_sleep(10);
for (int i = 0; i <= fdmax; i++)
{
if (FD_ISSET(i, &master))
Expand Down Expand Up @@ -2248,10 +2259,10 @@ void RunManagerYAMRCondor::cleanup(int cluster)
stringstream ss;
ss << "condor_rm " << cluster << " 1>cr_temp.stdout 2>cr_temp.stderr";
system(ss.str().c_str());
w_sleep(500);
w_sleep(10);
ss.str(string());
ss << "condor_rm " << cluster << " -forcex 1>cr_temp.stdout 2>cr_temp.stderr";
w_sleep(500);
w_sleep(10);
system(ss.str().c_str());
RunManagerPanther::close_agents();
cout << " all agents freed " << endl << endl;
Expand Down
4 changes: 3 additions & 1 deletion src/libs/run_managers/yamr/RunManagerPanther.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ class RunManagerPanther : public RunManagerAbstract
static const int MAX_PING_INTERVAL_SECS;
static const int MAX_CONCURRENT_RUNS_LOWER_LIMIT;
static const int IDLE_THREAD_SIGNAL_TIMEOUT_SECS;

static const double MIN_AVGRUNMINS_FOR_KILL;
static const int SECONDS_BETWEEN_ECHOS;
double overdue_reched_fac;
double overdue_giveup_fac;
double overdue_giveup_minutes;
Expand All @@ -141,6 +142,7 @@ class RunManagerPanther : public RunManagerAbstract
long long bytes_transferred;
int files_transferred;
bool should_echo;
std::chrono::system_clock::time_point last_echo_time;
int nftx;
fd_set master; // master file descriptor list
list<AgentInfoRec> agent_info_set;
Expand Down

0 comments on commit 5e36ab8

Please sign in to comment.