Skip to content

Commit

Permalink
DAOS-16824 cart: lower error messages to warn level (#15529) (#15893)
Browse files Browse the repository at this point in the history
lower error messages to warn level

Signed-off-by: Jinshan Xiong <[email protected]>
  • Loading branch information
jxiong authored Feb 25, 2025
1 parent 9977cb9 commit 6fcf01f
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 54 deletions.
48 changes: 13 additions & 35 deletions src/cart/crt_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -522,8 +522,8 @@ crt_rpc_complete_and_unlock(struct crt_rpc_priv *rpc_priv, int rc)
cbinfo.cci_rc = rpc_priv->crp_reply_hdr.cch_rc;

if (cbinfo.cci_rc != 0)
RPC_CERROR(crt_quiet_error(cbinfo.cci_rc), DB_NET, rpc_priv,
"failed, " DF_RC "\n", DP_RC(cbinfo.cci_rc));
RPC_CWARN(crt_quiet_error(cbinfo.cci_rc), DB_NET, rpc_priv,
"failed, " DF_RC "\n", DP_RC(cbinfo.cci_rc));

RPC_TRACE(DB_TRACE, rpc_priv,
"Invoking RPC callback (rank %d tag %d) rc: "
Expand Down Expand Up @@ -1194,8 +1194,7 @@ crt_context_timeout_check(struct crt_context *crt_ctx)
struct d_binheap_node *bh_node;
d_list_t timeout_list;
uint64_t ts_now;
int err_to_print = 0;
int left_to_print = 0;
bool print_once = false;

D_ASSERT(crt_ctx != NULL);

Expand Down Expand Up @@ -1223,47 +1222,26 @@ crt_context_timeout_check(struct crt_context *crt_ctx)
};
D_MUTEX_UNLOCK(&crt_ctx->cc_mutex);

/* Limit logging when many rpcs time-out at the same time */
d_list_for_each_entry(rpc_priv, &timeout_list, crp_tmp_link_timeout) {
left_to_print++;
}

/* TODO: might expose via env in future */
err_to_print = 1;

/* handle the timeout RPCs */
while ((rpc_priv =
d_list_pop_entry(&timeout_list, struct crt_rpc_priv, crp_tmp_link_timeout))) {
/*
* This error is annoying and fills the logs. For now prevent bursts of timeouts
* happening at the same time.
*
* Ideally we would also limit to 1 error per target. Can keep track of it in per
* target cache used for hg_addrs.
*
* Extra lookup cost of cache entry would be ok as this is an error case
**/

if (err_to_print > 0) {
RPC_ERROR(rpc_priv,
"ctx_id %d, (status: %#x) timed out (%d seconds), "
"target (%d:%d)\n",
crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec,
rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag);
err_to_print--;
left_to_print--;

if (err_to_print == 0 && left_to_print > 0)
D_ERROR(" %d more rpcs timed out. rest logged at INFO level\n",
left_to_print);
/* NB: The reason that the error message is printed at INFO
* level is because the user should know how serious the error
* is and they will print the RPC error at appropriate level */
if (!print_once) {
print_once = true;

RPC_WARN(rpc_priv,
"ctx_id %d, (status: %#x) timed out (%d seconds), "
"target (%d:%d)\n",
crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec,
rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag);
} else {
RPC_INFO(rpc_priv,
"ctx_id %d, (status: %#x) timed out (%d seconds), "
"target (%d:%d)\n",
crt_ctx->cc_idx, rpc_priv->crp_state, rpc_priv->crp_timeout_sec,
rpc_priv->crp_pub.cr_ep.ep_rank, rpc_priv->crp_pub.cr_ep.ep_tag);
left_to_print--;
}

crt_req_timeout_hdlr(rpc_priv);
Expand Down
8 changes: 3 additions & 5 deletions src/cart/crt_hg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1157,9 +1157,7 @@ crt_rpc_handler_common(hg_handle_t hg_hdl)
else
rc = crt_corpc_common_hdlr(rpc_priv);
if (unlikely(rc != 0)) {
RPC_ERROR(rpc_priv,
"failed to invoke RPC handler, rc: "DF_RC"\n",
DP_RC(rc));
RPC_INFO(rpc_priv, "failed to invoke RPC handler, rc: " DF_RC "\n", DP_RC(rc));
crt_hg_reply_error_send(rpc_priv, rc);
D_GOTO(decref, hg_ret = HG_SUCCESS);
}
Expand Down Expand Up @@ -1367,8 +1365,8 @@ crt_hg_req_send_cb(const struct hg_cb_info *hg_cbinfo)
crt_cbinfo.cci_rc = rc;

if (crt_cbinfo.cci_rc != 0)
RPC_CERROR(crt_quiet_error(crt_cbinfo.cci_rc), DB_NET, rpc_priv,
"RPC failed; rc: " DF_RC "\n", DP_RC(crt_cbinfo.cci_rc));
RPC_CWARN(crt_quiet_error(crt_cbinfo.cci_rc), DB_NET, rpc_priv,
"RPC failed; rc: " DF_RC "\n", DP_RC(crt_cbinfo.cci_rc));

RPC_TRACE(DB_TRACE, rpc_priv,
"Invoking RPC callback (rank %d tag %d) rc: " DF_RC "\n",
Expand Down
12 changes: 4 additions & 8 deletions src/cart/crt_hg_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -712,14 +712,10 @@ crt_proc_out_common(crt_proc_t proc, crt_rpc_output_t *data)

rc2 = rpc_priv->crp_reply_hdr.cch_rc;
if (rc2 != 0) {
if (rpc_priv->crp_reply_hdr.cch_rc != -DER_GRPVER)
RPC_ERROR(rpc_priv,
"RPC failed to execute on target. "
"error code: "DF_RC"\n", DP_RC(rc2));
else
RPC_TRACE(DB_NET, rpc_priv,
"RPC failed to execute on target. "
"error code: "DF_RC"\n", DP_RC(rc2));
RPC_CWARN(crt_quiet_error(rc2), DB_NET, rpc_priv,
"RPC failed to execute on target. "
"error code: " DF_RC "\n",
DP_RC(rc2));

D_GOTO(out, rc);
}
Expand Down
8 changes: 8 additions & 0 deletions src/cart/crt_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@
RPC_ERROR(rpc, fmt, ## __VA_ARGS__); \
} while (0)

#define RPC_CWARN(cond, mask, rpc, fmt, ...) \
do { \
if (cond) \
RPC_TRACE(mask, rpc, fmt, ##__VA_ARGS__); \
else \
RPC_WARN(rpc, fmt, ##__VA_ARGS__); \
} while (0)

#ifdef CRT_DEBUG_TRACE
# define CRT_ENTRY() \
D_DEBUG(DB_TRACE, ">>>> Entered %s: %d\n", __func__, __LINE__)
Expand Down
4 changes: 2 additions & 2 deletions src/cart/crt_register.c
Original file line number Diff line number Diff line change
Expand Up @@ -546,8 +546,8 @@ proto_query_cb(const struct crt_cb_info *cb_info)
D_FREE(rpc_req_input->pq_ver.iov_buf);

if (cb_info->cci_rc != 0) {
D_ERROR("rpc (opc: %#x) failed: "DF_RC"\n", rpc_req->cr_opc,
DP_RC(cb_info->cci_rc));
D_WARN("rpc (opc: %#x) failed: " DF_RC "\n", rpc_req->cr_opc,
DP_RC(cb_info->cci_rc));
D_GOTO(out, user_cb_info.pq_rc = cb_info->cci_rc);
}

Expand Down
8 changes: 4 additions & 4 deletions src/cart/crt_rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -887,8 +887,8 @@ uri_lookup_cb(const struct crt_cb_info *cb_info)
ul_out = crt_reply_get(lookup_rpc);

if (ul_out->ul_rc != 0) {
RPC_ERROR(chained_rpc_priv, "URI_LOOKUP returned rc="DF_RC"\n",
DP_RC(ul_out->ul_rc));
RPC_WARN(chained_rpc_priv, "URI_LOOKUP returned rc=" DF_RC "\n",
DP_RC(ul_out->ul_rc));
D_GOTO(retry, rc = ul_out->ul_rc);
}

Expand Down Expand Up @@ -973,8 +973,8 @@ uri_lookup_cb(const struct crt_cb_info *cb_info)
chained_rpc_priv);
D_GOTO(out, rc);
} else {
D_ERROR("URI lookups exceeded %d retries\n",
chained_rpc_priv->crp_ul_retry);
D_WARN("URI lookups exceeded %d retries\n",
chained_rpc_priv->crp_ul_retry);
}
} else {
DL_INFO(rc, "URI_LOOKUP for (%d:%d) failed during PROTO_QUERY",
Expand Down

0 comments on commit 6fcf01f

Please sign in to comment.