From 9a4bec311286a1439483e65ae8dcb067ec4c14f7 Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Tue, 3 Dec 2024 07:48:07 -0800 Subject: [PATCH] DAOS-16824 cart: lower error messages to warn level (#15529) * lower error messages to warn level Change-Id: I71966a235c068abbd3c74c422458a68e3a86154a Signed-off-by: Jinshan Xiong --- src/cart/crt_context.c | 48 +++++++++++------------------------------ src/cart/crt_hg.c | 8 +++---- src/cart/crt_hg_proc.c | 12 ++++------- src/cart/crt_internal.h | 8 +++++++ src/cart/crt_register.c | 4 ++-- src/cart/crt_rpc.c | 8 +++---- 6 files changed, 34 insertions(+), 54 deletions(-) diff --git a/src/cart/crt_context.c b/src/cart/crt_context.c index 2927ce75150..610b6fae1a6 100644 --- a/src/cart/crt_context.c +++ b/src/cart/crt_context.c @@ -534,8 +534,8 @@ crt_rpc_complete_and_unlock(struct crt_rpc_priv *rpc_priv, int rc) cbinfo.cci_rc = rpc_priv->crp_reply_hdr.cch_rc; if (cbinfo.cci_rc != 0) - RPC_CERROR(crt_quiet_error(cbinfo.cci_rc), DB_NET, rpc_priv, - "failed, " DF_RC "\n", DP_RC(cbinfo.cci_rc)); + RPC_CWARN(crt_quiet_error(cbinfo.cci_rc), DB_NET, rpc_priv, + "failed, " DF_RC "\n", DP_RC(cbinfo.cci_rc)); RPC_TRACE(DB_TRACE, rpc_priv, "Invoking RPC callback (rank %d tag %d) rc: " @@ -1227,8 +1227,7 @@ crt_context_timeout_check(struct crt_context *crt_ctx) struct d_binheap_node *bh_node; d_list_t timeout_list; uint64_t ts_now; - int err_to_print = 0; - int left_to_print = 0; + bool print_once = false; D_ASSERT(crt_ctx != NULL); @@ -1256,47 +1255,26 @@ crt_context_timeout_check(struct crt_context *crt_ctx) }; D_MUTEX_UNLOCK(&crt_ctx->cc_mutex); - /* Limit logging when many rpcs time-out at the same time */ - d_list_for_each_entry(rpc_priv, &timeout_list, crp_tmp_link_timeout) { - left_to_print++; - } - - /* TODO: might expose via env in future */ - err_to_print = 1; - /* handle the timeout RPCs */ while ((rpc_priv = d_list_pop_entry(&timeout_list, struct crt_rpc_priv, crp_tmp_link_timeout))) { - /* - * This error is annoying and fills the logs. For now prevent bursts of timeouts - * happening at the same time. - * - * Ideally we would also limit to 1 error per target. Can keep track of it in per - * target cache used for hg_addrs. - * - * Extra lookup cost of cache entry would be ok as this is an error case - **/ - - if (err_to_print > 0) { - RPC_ERROR(rpc_priv, - "ctx_id %d, (status: %#x) timed out (%d seconds), " - "target (%d:%d)\n", - crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec, - rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag); - err_to_print--; - left_to_print--; - - if (err_to_print == 0 && left_to_print > 0) - D_ERROR(" %d more rpcs timed out. rest logged at INFO level\n", - left_to_print); + /* NB: The reason that the error message is printed at INFO + * level is because the user should know how serious the error + * is and they will print the RPC error at appropriate level */ + if (!print_once) { + print_once = true; + RPC_WARN(rpc_priv, + "ctx_id %d, (status: %#x) timed out (%d seconds), " + "target (%d:%d)\n", + crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec, + rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag); } else { RPC_INFO(rpc_priv, "ctx_id %d, (status: %#x) timed out (%d seconds), " "target (%d:%d)\n", crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec, rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag); - left_to_print--; } crt_req_timeout_hdlr(rpc_priv); diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 59311cf691b..8f2395a44ff 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1154,9 +1154,7 @@ crt_rpc_handler_common(hg_handle_t hg_hdl) else rc = crt_corpc_common_hdlr(rpc_priv); if (unlikely(rc != 0)) { - RPC_ERROR(rpc_priv, - "failed to invoke RPC handler, rc: "DF_RC"\n", - DP_RC(rc)); + RPC_INFO(rpc_priv, "failed to invoke RPC handler, rc: " DF_RC "\n", DP_RC(rc)); crt_hg_reply_error_send(rpc_priv, rc); D_GOTO(decref, hg_ret = HG_SUCCESS); } @@ -1364,8 +1362,8 @@ crt_hg_req_send_cb(const struct hg_cb_info *hg_cbinfo) crt_cbinfo.cci_rc = rc; if (crt_cbinfo.cci_rc != 0) - RPC_CERROR(crt_quiet_error(crt_cbinfo.cci_rc), DB_NET, rpc_priv, - "RPC failed; rc: " DF_RC "\n", DP_RC(crt_cbinfo.cci_rc)); + RPC_CWARN(crt_quiet_error(crt_cbinfo.cci_rc), DB_NET, rpc_priv, + "RPC failed; rc: " DF_RC "\n", DP_RC(crt_cbinfo.cci_rc)); RPC_TRACE(DB_TRACE, rpc_priv, "Invoking RPC callback (rank %d tag %d) rc: " DF_RC "\n", diff --git a/src/cart/crt_hg_proc.c b/src/cart/crt_hg_proc.c index 47d003ee822..fa6df8b7397 100644 --- a/src/cart/crt_hg_proc.c +++ b/src/cart/crt_hg_proc.c @@ -712,14 +712,10 @@ crt_proc_out_common(crt_proc_t proc, crt_rpc_output_t *data) rc2 = rpc_priv->crp_reply_hdr.cch_rc; if (rc2 != 0) { - if (rpc_priv->crp_reply_hdr.cch_rc != -DER_GRPVER) - RPC_ERROR(rpc_priv, - "RPC failed to execute on target. " - "error code: "DF_RC"\n", DP_RC(rc2)); - else - RPC_TRACE(DB_NET, rpc_priv, - "RPC failed to execute on target. " - "error code: "DF_RC"\n", DP_RC(rc2)); + RPC_CWARN(crt_quiet_error(rc2), DB_NET, rpc_priv, + "RPC failed to execute on target. " + "error code: " DF_RC "\n", + DP_RC(rc2)); D_GOTO(out, rc); } diff --git a/src/cart/crt_internal.h b/src/cart/crt_internal.h index 58418d770cd..d9cd17b156d 100644 --- a/src/cart/crt_internal.h +++ b/src/cart/crt_internal.h @@ -92,6 +92,14 @@ RPC_ERROR(rpc, fmt, ## __VA_ARGS__); \ } while (0) +#define RPC_CWARN(cond, mask, rpc, fmt, ...) \ + do { \ + if (cond) \ + RPC_TRACE(mask, rpc, fmt, ##__VA_ARGS__); \ + else \ + RPC_WARN(rpc, fmt, ##__VA_ARGS__); \ + } while (0) + #ifdef CRT_DEBUG_TRACE # define CRT_ENTRY() \ D_DEBUG(DB_TRACE, ">>>> Entered %s: %d\n", __func__, __LINE__) diff --git a/src/cart/crt_register.c b/src/cart/crt_register.c index 0a90bacfed1..fc0118ef787 100644 --- a/src/cart/crt_register.c +++ b/src/cart/crt_register.c @@ -546,8 +546,8 @@ proto_query_cb(const struct crt_cb_info *cb_info) D_FREE(rpc_req_input->pq_ver.iov_buf); if (cb_info->cci_rc != 0) { - D_ERROR("rpc (opc: %#x) failed: "DF_RC"\n", rpc_req->cr_opc, - DP_RC(cb_info->cci_rc)); + D_WARN("rpc (opc: %#x) failed: " DF_RC "\n", rpc_req->cr_opc, + DP_RC(cb_info->cci_rc)); D_GOTO(out, user_cb_info.pq_rc = cb_info->cci_rc); } diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index 2a7db9a90ad..b9effbeb9ce 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -887,8 +887,8 @@ uri_lookup_cb(const struct crt_cb_info *cb_info) ul_out = crt_reply_get(lookup_rpc); if (ul_out->ul_rc != 0) { - RPC_ERROR(chained_rpc_priv, "URI_LOOKUP returned rc="DF_RC"\n", - DP_RC(ul_out->ul_rc)); + RPC_WARN(chained_rpc_priv, "URI_LOOKUP returned rc=" DF_RC "\n", + DP_RC(ul_out->ul_rc)); D_GOTO(retry, rc = ul_out->ul_rc); } @@ -973,8 +973,8 @@ uri_lookup_cb(const struct crt_cb_info *cb_info) chained_rpc_priv); D_GOTO(out, rc); } else { - D_ERROR("URI lookups exceeded %d retries\n", - chained_rpc_priv->crp_ul_retry); + D_WARN("URI lookups exceeded %d retries\n", + chained_rpc_priv->crp_ul_retry); } } else { DL_INFO(rc, "URI_LOOKUP for (%d:%d) failed during PROTO_QUERY",