Skip to content

Commit

Permalink
Fix: tools: crm_mon --one-shot fails while pacemaker is shutting down
Browse files Browse the repository at this point in the history
crm_mon --one-shot checks the pacemakerd state before trying to get a
CIB connection. If pacemakerd is shutting down, it returns ENOTCONN.
This can cause a resource agent that calls crm_mon (for example,
ocf:heartbeat:pgsql) to fail to stop during shutdown.

This is a regression introduced by commit 3f342e3.
crm_mon.c:pacemakerd_status() returns pcmk_rc_ok if pacemakerd is
shutting down, since 49ebe4c and 46d6edd (fixes for CLBZ#5471). 3f342e3
refactored crm_mon --one-shot to use library functions. pcmk__status()
now does most of the work, calling pcmk_status.c:pacemakerd_status().
That function returns ENOTCONN if pacemakerd is shutting down. As a
result, we don't try to connect to the CIB during shutdown.

Here we update pcmk__status() to use pcmk__pacemakerd_status() instead
of a static and mostly redundant pacemakerd_status(). It receives the
pacemakerd state via an output pointer argument. If pacemakerd is
running or shutting down (or if we get an EREMOTEIO rc), we try
connecting to the fencer and CIB. However, as long as we successfully
get the pacemakerd state, we return success from pcmk__status(), since
we did obtain the cluster status.

A couple of minor notes:
* pcmk__status() now takes a timeout argument that it passes to
  pcmk__pacemakerd_status(). timeout == 0 uses pcmk_ipc_dispatch_sync,
  matching the old implementation. A positive timeout uses
  pcmk_ipc_dispatch_main.
* pcmk_cluster_queries.c:ipc_connect() no longer always prints a "Could
  not connect" error for EREMOTEIO. The caller may consider it OK.

Fixes T579
Fixes CLBZ#5501

Signed-off-by: Reid Wahl <[email protected]>
  • Loading branch information
nrwahl2 committed Oct 11, 2022
1 parent cd62547 commit 6dd3934
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 90 deletions.
2 changes: 1 addition & 1 deletion include/pcmki/pcmki_status.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ int pcmk__status(pcmk__output_t *out, cib_t *cib,
enum pcmk__fence_history fence_history, uint32_t show,
uint32_t show_opts, const char *only_node,
const char *only_rsc, const char *neg_location_prefix,
bool simple_output);
bool simple_output, guint timeout_ms);

#ifdef __cplusplus
}
Expand Down
27 changes: 20 additions & 7 deletions lib/pacemaker/pcmk_cluster_queries.c
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,

static pcmk_ipc_api_t *
ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb,
enum pcmk_ipc_dispatch dispatch_type)
enum pcmk_ipc_dispatch dispatch_type, bool eremoteio_ok)
{
int rc;
pcmk__output_t *out = data->out;
Expand All @@ -267,9 +267,15 @@ ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb,

rc = pcmk_connect_ipc(api, dispatch_type);
if (rc != pcmk_rc_ok) {
out->err(out, "error: Could not connect to %s: %s",
pcmk_ipc_name(api, true),
pcmk_rc_str(rc));
if ((rc == EREMOTEIO) && eremoteio_ok) {
/* EREMOTEIO may be expected and acceptable for some callers.
* Preserve the return code in case callers need to handle it
* specially.
*/
} else {
out->err(out, "error: Could not connect to %s: %s",
pcmk_ipc_name(api, true), pcmk_rc_str(rc));
}
data->rc = rc;
pcmk_free_ipc_api(api);
return NULL;
Expand All @@ -296,7 +302,8 @@ pcmk__controller_status(pcmk__output_t *out, char *dest_node, guint message_time
dispatch_type = pcmk_ipc_dispatch_sync;
}
controld_api = ipc_connect(&data, pcmk_ipc_controld,
controller_status_event_cb, dispatch_type);
controller_status_event_cb, dispatch_type,
false);

if (controld_api != NULL) {
int rc = pcmk_controld_api_ping(controld_api, dest_node);
Expand Down Expand Up @@ -352,7 +359,8 @@ pcmk__designated_controller(pcmk__output_t *out, guint message_timeout_ms)
dispatch_type = pcmk_ipc_dispatch_sync;
}
controld_api = ipc_connect(&data, pcmk_ipc_controld,
designated_controller_event_cb, dispatch_type);
designated_controller_event_cb, dispatch_type,
false);

if (controld_api != NULL) {
int rc = pcmk_controld_api_ping(controld_api, NULL);
Expand Down Expand Up @@ -407,6 +415,11 @@ pcmk_designated_controller(xmlNodePtr *xml, unsigned int message_timeout_ms)
* not \p NULL
*
* \return Standard Pacemaker return code
*
* \note This function returns \p EREMOTEIO if run on a Pacemaker Remote node,
* since \p pacemakerd is not proxied to remote nodes. The fencer and CIB
* may still be accessible, but \p state will be
* \p pcmk_pacemakerd_state_invalid.
*/
int
pcmk__pacemakerd_status(pcmk__output_t *out, const char *ipc_name,
Expand All @@ -428,7 +441,7 @@ pcmk__pacemakerd_status(pcmk__output_t *out, const char *ipc_name,
dispatch_type = pcmk_ipc_dispatch_sync;
}
pacemakerd_api = ipc_connect(&data, pcmk_ipc_pacemakerd,
pacemakerd_event_cb, dispatch_type);
pacemakerd_event_cb, dispatch_type, true);

if (pacemakerd_api != NULL) {
int rc = pcmk_pacemakerd_api_ping(pacemakerd_api, ipc_name);
Expand Down
122 changes: 41 additions & 81 deletions lib/pacemaker/pcmk_status.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,71 +70,6 @@ fencing_connect(void)
}
}

static void
pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,
enum pcmk_ipc_event event_type, crm_exit_t status,
void *event_data, void *user_data)
{
pcmk_pacemakerd_api_reply_t *reply = event_data;
enum pcmk_pacemakerd_state *state =
(enum pcmk_pacemakerd_state *) user_data;

/* we are just interested in the latest reply */
*state = pcmk_pacemakerd_state_invalid;

if (event_type != pcmk_ipc_event_reply || status != CRM_EX_OK) {
return;
}

if (reply->reply_type == pcmk_pacemakerd_reply_ping &&
reply->data.ping.last_good != (time_t) 0 &&
reply->data.ping.status == pcmk_rc_ok) {
*state = reply->data.ping.state;
}
}

static int
pacemakerd_status(pcmk__output_t *out)
{
int rc = pcmk_rc_ok;
pcmk_ipc_api_t *pacemakerd_api = NULL;
enum pcmk_pacemakerd_state state = pcmk_pacemakerd_state_invalid;

rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
if (pacemakerd_api == NULL) {
out->err(out, "Could not connect to pacemakerd: %s",
pcmk_rc_str(rc));
return rc;
}

pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, (void *) &state);

rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_sync);
if (rc == EREMOTEIO) {
return pcmk_rc_ok;
} else if (rc != pcmk_rc_ok) {
out->err(out, "Could not connect to pacemakerd: %s",
pcmk_rc_str(rc));
pcmk_free_ipc_api(pacemakerd_api);
return rc;
}

rc = pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);

if (rc != pcmk_rc_ok) {
/* Got some error from pcmk_pacemakerd_api_ping, so return it. */
} else if (state == pcmk_pacemakerd_state_running) {
rc = pcmk_rc_ok;
} else if (state == pcmk_pacemakerd_state_shutting_down) {
rc = ENOTCONN;
} else {
rc = EAGAIN;
}

pcmk_free_ipc_api(pacemakerd_api);
return rc;
}

/*!
* \internal
* \brief Output the cluster status given a fencer and CIB connection
Expand Down Expand Up @@ -257,7 +192,7 @@ pcmk_status(xmlNodePtr *xml)
stonith__register_messages(out);

rc = pcmk__status(out, cib, pcmk__fence_history_full, pcmk_section_all,
show_opts, NULL, NULL, NULL, false);
show_opts, NULL, NULL, NULL, false, 0);
pcmk__xml_output_finish(out, xml);

cib_delete(cib);
Expand Down Expand Up @@ -289,41 +224,63 @@ pcmk_status(xmlNodePtr *xml)
* \param[in] simple_output Whether to use a simple output format.
* Note: This is for use by \p crm_mon only
* and is planned to be deprecated.
* \param[in] timeout_ms How long to wait for a reply from the
* \p pacemakerd API. If 0,
* \p pcmk_ipc_dispatch_sync will be used.
* If positive, \p pcmk_ipc_dispatch_main
* will be used, and a new mainloop will be
* created for this purpose (freed before
* return).
*
* \return Standard Pacemaker return code
*/
int
pcmk__status(pcmk__output_t *out, cib_t *cib,
enum pcmk__fence_history fence_history, uint32_t show,
uint32_t show_opts, const char *only_node, const char *only_rsc,
const char *neg_location_prefix, bool simple_output)
const char *neg_location_prefix, bool simple_output,
guint timeout_ms)
{
xmlNode *current_cib = NULL;
int rc = pcmk_rc_ok;
stonith_t *stonith = NULL;
enum pcmk_pacemakerd_state state = pcmk_pacemakerd_state_invalid;

if (cib == NULL) {
return ENOTCONN;
}

if (cib->variant == cib_native) {
if (cib->state == cib_connected_query || cib->state == cib_connected_command) {
rc = pcmk_rc_ok;
} else {
rc = pacemakerd_status(out);
if ((cib->variant == cib_native)
&& (cib->state != cib_connected_query)
&& (cib->state != cib_connected_command)) {

rc = pcmk__pacemakerd_status(out, crm_system_name,
(timeout_ms < 0)? 0 : timeout_ms,
&state);
switch (rc) {
case pcmk_rc_ok:
switch (state) {
case pcmk_pacemakerd_state_running:
case pcmk_pacemakerd_state_shutting_down:
// CIB may still be available while shutting down
break;
default:
return rc;
}
break;
case EREMOTEIO:
/* We'll always get EREMOTEIO if we run this on a Pacemaker
* Remote node. The fencer and CIB might be available.
*/
rc = pcmk_rc_ok;
break;
default:
return rc;
}
}

if (rc != pcmk_rc_ok) {
return rc;
}

if (fence_history != pcmk__fence_history_none && cib->variant == cib_native) {
stonith = fencing_connect();

if (stonith == NULL) {
return ENOTCONN;
}
}

rc = cib_connect(out, cib, &current_cib);
Expand All @@ -335,6 +292,9 @@ pcmk__status(pcmk__output_t *out, cib_t *cib,
fence_history, show, show_opts, only_node,
only_rsc, neg_location_prefix,
simple_output);
if (rc != pcmk_rc_ok) {
out->err(out, "Error outputting status info from the fencer or CIB");
}

done:
if (stonith != NULL) {
Expand All @@ -350,7 +310,7 @@ pcmk__status(pcmk__output_t *out, cib_t *cib,
free_xml(current_cib);
}

return rc;
return pcmk_rc_ok;
}

/* This is an internal-only function that is planned to be deprecated and removed.
Expand Down
2 changes: 1 addition & 1 deletion tools/crm_mon.c
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,7 @@ one_shot(void)
int rc = pcmk__status(out, cib, fence_history, show, show_opts,
options.only_node, options.only_rsc,
options.neg_location_prefix,
output_format == mon_output_monitor);
output_format == mon_output_monitor, 0);

if (rc == pcmk_rc_ok) {
clean_up(pcmk_rc2exitc(rc));
Expand Down

0 comments on commit 6dd3934

Please sign in to comment.