Skip to content

Commit

Permalink
Fix: tools: crm_mon --one-shot fails while pacemaker is shutting down
Browse files Browse the repository at this point in the history
crm_mon --one-shot checks the pacemakerd state before trying to get a
CIB connection. If pacemakerd is shutting down, it returns ENOTCONN.
This can cause a resource agent that calls crm_mon (for example,
ocf:heartbeat:pgsql) to fail to stop during shutdown.

This is a regression introduced by commit 3f342e3.
crm_mon.c:pacemakerd_status() returns pcmk_rc_ok if pacemakerd is
shutting down, since 49ebe4c and 46d6edd (fixes for CLBZ#5471). 3f342e3
refactored crm_mon --one-shot to use library functions. pcmk__status()
now does most of the work, calling pcmk_status.c:pacemakerd_status().
That function returns ENOTCONN if pacemakerd is shutting down. As a
result, we don't try to connect to the CIB during shutdown.

Here we update pcmk__status() to use pcmk__pacemakerd_status() instead
of a static and mostly redundant pacemakerd_status(). It receives the
pacemakerd state via an output pointer argument. If pacemakerd is
running or shutting down (and also for some reason if we get an
EREMOTEIO rc), we try connecting to the fencer and CIB. However, as long
as we successfully get the pacemakerd state, we return success from
pcmk__status(), since we did obtain the cluster status.

A couple of minor notes:
* pcmk__status() now takes a timeout argument that it passes to
  pcmk__pacemakerd_status(). The previous implementation used
  pcmk_ipc_dispatch_sync, but pcmk__pacemakerd_status() uses
  pcmk_ipc_dispatch_main.
* The previous timeout was 5000 (hardcoded in crm_ipc_send() for
  synchronous dispatch). Now, if timeout_ms is nonpositive, 30000 is
  used as the timeout (hardcoded as
  pcmk_cluster_queries.c:DEFAULT_MESSAGE_TIMEOUT_MS).
* pcmk_cluster_queries.c:ipc_connect() no longer prints a "Could not
  connect" error for EREMOTEIO. The caller may consider it OK.

Fixes T579
Fixes CLBZ#5501

Signed-off-by: Reid Wahl <[email protected]>
  • Loading branch information
nrwahl2 committed Oct 11, 2022
1 parent ca30743 commit 82885ee
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 86 deletions.
2 changes: 1 addition & 1 deletion include/pcmki/pcmki_status.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ int pcmk__status(pcmk__output_t *out, cib_t *cib,
enum pcmk__fence_history fence_history, uint32_t show,
uint32_t show_opts, const char *only_node,
const char *only_rsc, const char *neg_location_prefix,
bool simple_output);
bool simple_output, int timeout_ms);

#ifdef __cplusplus
}
Expand Down
7 changes: 4 additions & 3 deletions lib/pacemaker/pcmk_cluster_queries.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,9 +274,10 @@ ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb)
}
rc = pcmk_connect_ipc(api, pcmk_ipc_dispatch_main);
if (rc != pcmk_rc_ok) {
out->err(out, "error: Could not connect to %s: %s",
pcmk_ipc_name(api, true),
pcmk_rc_str(rc));
if (rc != EREMOTEIO) {
out->err(out, "error: Could not connect to %s: %s",
pcmk_ipc_name(api, true), pcmk_rc_str(rc));
}
data->rc = rc;
pcmk_free_ipc_api(api);
return NULL;
Expand Down
129 changes: 48 additions & 81 deletions lib/pacemaker/pcmk_status.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,71 +70,6 @@ fencing_connect(void)
}
}

static void
pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,
enum pcmk_ipc_event event_type, crm_exit_t status,
void *event_data, void *user_data)
{
pcmk_pacemakerd_api_reply_t *reply = event_data;
enum pcmk_pacemakerd_state *state =
(enum pcmk_pacemakerd_state *) user_data;

/* we are just interested in the latest reply */
*state = pcmk_pacemakerd_state_invalid;

if (event_type != pcmk_ipc_event_reply || status != CRM_EX_OK) {
return;
}

if (reply->reply_type == pcmk_pacemakerd_reply_ping &&
reply->data.ping.last_good != (time_t) 0 &&
reply->data.ping.status == pcmk_rc_ok) {
*state = reply->data.ping.state;
}
}

static int
pacemakerd_status(pcmk__output_t *out)
{
int rc = pcmk_rc_ok;
pcmk_ipc_api_t *pacemakerd_api = NULL;
enum pcmk_pacemakerd_state state = pcmk_pacemakerd_state_invalid;

rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
if (pacemakerd_api == NULL) {
out->err(out, "Could not connect to pacemakerd: %s",
pcmk_rc_str(rc));
return rc;
}

pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, (void *) &state);

rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_sync);
if (rc == EREMOTEIO) {
return pcmk_rc_ok;
} else if (rc != pcmk_rc_ok) {
out->err(out, "Could not connect to pacemakerd: %s",
pcmk_rc_str(rc));
pcmk_free_ipc_api(pacemakerd_api);
return rc;
}

rc = pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);

if (rc != pcmk_rc_ok) {
/* Got some error from pcmk_pacemakerd_api_ping, so return it. */
} else if (state == pcmk_pacemakerd_state_running) {
rc = pcmk_rc_ok;
} else if (state == pcmk_pacemakerd_state_shutting_down) {
rc = ENOTCONN;
} else {
rc = EAGAIN;
}

pcmk_free_ipc_api(pacemakerd_api);
return rc;
}

/*!
* \internal
* \brief Output the cluster status given a fencer and CIB connection
Expand Down Expand Up @@ -257,7 +192,7 @@ pcmk_status(xmlNodePtr *xml)
stonith__register_messages(out);

rc = pcmk__status(out, cib, pcmk__fence_history_full, pcmk_section_all,
show_opts, NULL, NULL, NULL, false);
show_opts, NULL, NULL, NULL, false, 0);
pcmk__xml_output_finish(out, xml);

cib_delete(cib);
Expand Down Expand Up @@ -289,41 +224,70 @@ pcmk_status(xmlNodePtr *xml)
* \param[in] simple_output Whether to use a simple output format.
* Note: This is for use by \p crm_mon only
* and is planned to be deprecated.
* \param[in] timeout_ms How long to wait for a reply from the
* \p pacemakerd API (if 0 or negative, a
* reasonable default will be used)
*
* \return Standard Pacemaker return code
*/
int
pcmk__status(pcmk__output_t *out, cib_t *cib,
enum pcmk__fence_history fence_history, uint32_t show,
uint32_t show_opts, const char *only_node, const char *only_rsc,
const char *neg_location_prefix, bool simple_output)
const char *neg_location_prefix, bool simple_output,
int timeout_ms)
{
xmlNode *current_cib = NULL;
int rc = pcmk_rc_ok;
stonith_t *stonith = NULL;
enum pcmk_pacemakerd_state state = pcmk_pacemakerd_state_invalid;

if (cib == NULL) {
return ENOTCONN;
}

if (cib->variant == cib_native) {
if (cib->state == cib_connected_query || cib->state == cib_connected_command) {
rc = pcmk_rc_ok;
} else {
rc = pacemakerd_status(out);
if ((cib->variant == cib_native)
&& (cib->state != cib_connected_query)
&& (cib->state != cib_connected_command)) {

rc = pcmk__pacemakerd_status(out, crm_system_name,
(timeout_ms < 0)? 0 : timeout_ms,
&state);
switch (rc) {
case pcmk_rc_ok:
switch (state) {
case pcmk_pacemakerd_state_running:
case pcmk_pacemakerd_state_shutting_down:
// CIB may still be available while shutting down
break;
default:
return rc;
}
break;
case EREMOTEIO:
// @TODO: Why do we try to connect to the fencer and CIB here?
crm_time_t *now = crm_time_new(NULL);
char *now_s = crm_time_as_string(now,
crm_time_log_date
|crm_time_log_timeofday
|crm_time_log_with_timezone);

rc = pcmk_rc_ok;
out->message(out, "pacemakerd-health",
CRM_SYSTEM_MCP, state,
"Running on Pacemaker Remote node, waiting to be "
"contacted by cluster", now_s);

crm_time_free(now);
free(now_s);
break;
default:
return rc;
}
}

if (rc != pcmk_rc_ok) {
return rc;
}

if (fence_history != pcmk__fence_history_none && cib->variant == cib_native) {
stonith = fencing_connect();

if (stonith == NULL) {
return ENOTCONN;
}
}

rc = cib_connect(out, cib, &current_cib);
Expand All @@ -335,6 +299,9 @@ pcmk__status(pcmk__output_t *out, cib_t *cib,
fence_history, show, show_opts, only_node,
only_rsc, neg_location_prefix,
simple_output);
if (rc != pcmk_rc_ok) {
out->err(out, "Error outputting status info from the fencer or CIB");
}

done:
if (stonith != NULL) {
Expand All @@ -350,7 +317,7 @@ pcmk__status(pcmk__output_t *out, cib_t *cib,
free_xml(current_cib);
}

return rc;
return pcmk_rc_ok;
}

/* This is an internal-only function that is planned to be deprecated and removed.
Expand Down
2 changes: 1 addition & 1 deletion tools/crm_mon.c
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,7 @@ one_shot(void)
int rc = pcmk__status(out, cib, fence_history, show, show_opts,
options.only_node, options.only_rsc,
options.neg_location_prefix,
output_format == mon_output_monitor);
output_format == mon_output_monitor, 0);

if (rc == pcmk_rc_ok) {
clean_up(pcmk_rc2exitc(rc));
Expand Down

0 comments on commit 82885ee

Please sign in to comment.