Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16824 cart: lower error messages to warn level #15529

Merged
merged 8 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 13 additions & 35 deletions src/cart/crt_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,8 @@ crt_rpc_complete_and_unlock(struct crt_rpc_priv *rpc_priv, int rc)
cbinfo.cci_rc = rpc_priv->crp_reply_hdr.cch_rc;

if (cbinfo.cci_rc != 0)
RPC_CERROR(crt_quiet_error(cbinfo.cci_rc), DB_NET, rpc_priv,
"failed, " DF_RC "\n", DP_RC(cbinfo.cci_rc));
RPC_CWARN(crt_quiet_error(cbinfo.cci_rc), DB_NET, rpc_priv,
"failed, " DF_RC "\n", DP_RC(cbinfo.cci_rc));

RPC_TRACE(DB_TRACE, rpc_priv,
"Invoking RPC callback (rank %d tag %d) rc: "
Expand Down Expand Up @@ -1227,8 +1227,7 @@ crt_context_timeout_check(struct crt_context *crt_ctx)
struct d_binheap_node *bh_node;
d_list_t timeout_list;
uint64_t ts_now;
int err_to_print = 0;
int left_to_print = 0;
bool print_once = false;

D_ASSERT(crt_ctx != NULL);

Expand Down Expand Up @@ -1256,47 +1255,26 @@ crt_context_timeout_check(struct crt_context *crt_ctx)
};
D_MUTEX_UNLOCK(&crt_ctx->cc_mutex);

/* Limit logging when many rpcs time-out at the same time */
d_list_for_each_entry(rpc_priv, &timeout_list, crp_tmp_link_timeout) {
left_to_print++;
}

/* TODO: might expose via env in future */
err_to_print = 1;

/* handle the timeout RPCs */
while ((rpc_priv =
d_list_pop_entry(&timeout_list, struct crt_rpc_priv, crp_tmp_link_timeout))) {
/*
* This error is annoying and fills the logs. For now prevent bursts of timeouts
* happening at the same time.
*
* Ideally we would also limit to 1 error per target. Can keep track of it in per
* target cache used for hg_addrs.
*
* Extra lookup cost of cache entry would be ok as this is an error case
**/

if (err_to_print > 0) {
RPC_ERROR(rpc_priv,
"ctx_id %d, (status: %#x) timed out (%d seconds), "
"target (%d:%d)\n",
crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec,
rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag);
err_to_print--;
left_to_print--;

if (err_to_print == 0 && left_to_print > 0)
D_ERROR(" %d more rpcs timed out. rest logged at INFO level\n",
left_to_print);
/* NB: The reason that the error message is printed at INFO
frostedcmos marked this conversation as resolved.
Show resolved Hide resolved
* level is because the user should know how serious the error
* is and they will print the RPC error at appropriate level */
if (!print_once) {
print_once = true;

RPC_WARN(rpc_priv,
"ctx_id %d, (status: %#x) timed out (%d seconds), "
"target (%d:%d)\n",
crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec,
rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag);
} else {
RPC_INFO(rpc_priv,
"ctx_id %d, (status: %#x) timed out (%d seconds), "
"target (%d:%d)\n",
crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec,
rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag);
left_to_print--;
}

crt_req_timeout_hdlr(rpc_priv);
Expand Down
8 changes: 3 additions & 5 deletions src/cart/crt_hg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1154,9 +1154,7 @@ crt_rpc_handler_common(hg_handle_t hg_hdl)
else
rc = crt_corpc_common_hdlr(rpc_priv);
if (unlikely(rc != 0)) {
RPC_ERROR(rpc_priv,
"failed to invoke RPC handler, rc: "DF_RC"\n",
DP_RC(rc));
RPC_INFO(rpc_priv, "failed to invoke RPC handler, rc: " DF_RC "\n", DP_RC(rc));
crt_hg_reply_error_send(rpc_priv, rc);
D_GOTO(decref, hg_ret = HG_SUCCESS);
}
Expand Down Expand Up @@ -1364,8 +1362,8 @@ crt_hg_req_send_cb(const struct hg_cb_info *hg_cbinfo)
crt_cbinfo.cci_rc = rc;

if (crt_cbinfo.cci_rc != 0)
RPC_CERROR(crt_quiet_error(crt_cbinfo.cci_rc), DB_NET, rpc_priv,
"RPC failed; rc: " DF_RC "\n", DP_RC(crt_cbinfo.cci_rc));
RPC_CWARN(crt_quiet_error(crt_cbinfo.cci_rc), DB_NET, rpc_priv,
"RPC failed; rc: " DF_RC "\n", DP_RC(crt_cbinfo.cci_rc));

RPC_TRACE(DB_TRACE, rpc_priv,
"Invoking RPC callback (rank %d tag %d) rc: " DF_RC "\n",
Expand Down
12 changes: 4 additions & 8 deletions src/cart/crt_hg_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -712,14 +712,10 @@ crt_proc_out_common(crt_proc_t proc, crt_rpc_output_t *data)

rc2 = rpc_priv->crp_reply_hdr.cch_rc;
if (rc2 != 0) {
if (rpc_priv->crp_reply_hdr.cch_rc != -DER_GRPVER)
RPC_ERROR(rpc_priv,
"RPC failed to execute on target. "
"error code: "DF_RC"\n", DP_RC(rc2));
else
RPC_TRACE(DB_NET, rpc_priv,
"RPC failed to execute on target. "
"error code: "DF_RC"\n", DP_RC(rc2));
RPC_CWARN(crt_quiet_error(rc2), DB_NET, rpc_priv,
"RPC failed to execute on target. "
"error code: " DF_RC "\n",
DP_RC(rc2));

D_GOTO(out, rc);
}
Expand Down
8 changes: 8 additions & 0 deletions src/cart/crt_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@
RPC_ERROR(rpc, fmt, ## __VA_ARGS__); \
} while (0)

#define RPC_CWARN(cond, mask, rpc, fmt, ...) \
do { \
if (cond) \
RPC_TRACE(mask, rpc, fmt, ##__VA_ARGS__); \
else \
RPC_WARN(rpc, fmt, ##__VA_ARGS__); \
} while (0)

#ifdef CRT_DEBUG_TRACE
# define CRT_ENTRY() \
D_DEBUG(DB_TRACE, ">>>> Entered %s: %d\n", __func__, __LINE__)
Expand Down
4 changes: 2 additions & 2 deletions src/cart/crt_register.c
Original file line number Diff line number Diff line change
Expand Up @@ -546,8 +546,8 @@ proto_query_cb(const struct crt_cb_info *cb_info)
D_FREE(rpc_req_input->pq_ver.iov_buf);

if (cb_info->cci_rc != 0) {
D_ERROR("rpc (opc: %#x) failed: "DF_RC"\n", rpc_req->cr_opc,
DP_RC(cb_info->cci_rc));
D_WARN("rpc (opc: %#x) failed: " DF_RC "\n", rpc_req->cr_opc,
DP_RC(cb_info->cci_rc));
D_GOTO(out, user_cb_info.pq_rc = cb_info->cci_rc);
}

Expand Down
8 changes: 4 additions & 4 deletions src/cart/crt_rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -887,8 +887,8 @@ uri_lookup_cb(const struct crt_cb_info *cb_info)
ul_out = crt_reply_get(lookup_rpc);

if (ul_out->ul_rc != 0) {
RPC_ERROR(chained_rpc_priv, "URI_LOOKUP returned rc="DF_RC"\n",
DP_RC(ul_out->ul_rc));
RPC_WARN(chained_rpc_priv, "URI_LOOKUP returned rc=" DF_RC "\n",
DP_RC(ul_out->ul_rc));
D_GOTO(retry, rc = ul_out->ul_rc);
}

Expand Down Expand Up @@ -973,8 +973,8 @@ uri_lookup_cb(const struct crt_cb_info *cb_info)
chained_rpc_priv);
D_GOTO(out, rc);
} else {
D_ERROR("URI lookups exceeded %d retries\n",
chained_rpc_priv->crp_ul_retry);
D_WARN("URI lookups exceeded %d retries\n",
chained_rpc_priv->crp_ul_retry);
}
} else {
DL_INFO(rc, "URI_LOOKUP for (%d:%d) failed during PROTO_QUERY",
Expand Down
Loading