Skip to content

Commit

Permalink
DAOS-14739 cart: Add SWIM stats for delay/glitches (#13587)
Browse files Browse the repository at this point in the history
Adds a gauge to measure SWIM delay and a counter
for glitches (temporary network outages).

Change-Id: Ibd85c08ab3e3a38931d795d62270f3e4059d7c67
Required-githooks: true

Change-Id: I854937dd249ad9f7211a3b7d40d3365a3e2f79f2
Signed-off-by: Michael MacDonald <[email protected]>
  • Loading branch information
mjmac authored and jolivier23 committed Apr 10, 2024
1 parent 654af60 commit 2360d90
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 4 deletions.
14 changes: 12 additions & 2 deletions src/cart/crt_context.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -329,13 +329,23 @@ crt_context_provider_create(crt_context_t *crt_ctx, crt_provider_t provider, boo
prov, ctx->cc_idx);
if (ret)
DL_WARN(rc, "Failed to create rpc waitq gauge");

ret = d_tm_add_metric(&ctx->cc_quotas.rpc_quota_exceeded, D_TM_COUNTER,
"Total number of exceeded RPC quota errors",
"errors", "net/%s/quota_exceeded/ctx_%u",
prov, ctx->cc_idx);
if (ret)
DL_WARN(rc, "Failed to create quota exceeded counter");
ret = d_tm_add_metric(&ctx->cc_net_glitches, D_TM_COUNTER,
"Total number of network glitch errors", "errors",
"net/%s/glitch/ctx_%u", prov, ctx->cc_idx);
if (ret)
DL_WARN(rc, "Failed to create network glitch counter");

ret = d_tm_add_metric(&ctx->cc_swim_delay, D_TM_STATS_GAUGE,
"SWIM delay measurements", "delay",
"net/%s/swim_delay/ctx_%u", prov, ctx->cc_idx);
if (ret)
DL_WARN(rc, "Failed to create SWIM delay gauge");
}

if (crt_is_service() &&
Expand Down
6 changes: 5 additions & 1 deletion src/cart/crt_internal_types.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -237,6 +237,10 @@ struct crt_context {
struct d_tm_node_t *cc_timedout_uri;
/** Total number of failed address resolution, of type counter */
struct d_tm_node_t *cc_failed_addr;
/** Counter for number of network glitches */
struct d_tm_node_t *cc_net_glitches;
/** Stats gauge of reported SWIM delays */
struct d_tm_node_t *cc_swim_delay;

/** Stores self uri for the current context */
char cc_self_uri[CRT_ADDR_STR_MAX_LEN];
Expand Down
23 changes: 22 additions & 1 deletion src/cart/crt_swim.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2019-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1005,6 +1005,24 @@ static void crt_swim_update_last_unpack_hlc(struct crt_swim_membs *csm)
D_RWLOCK_UNLOCK(&crt_gdata.cg_rwlock);
}

/**
 * Feed one SWIM progress observation into the per-context telemetry.
 *
 * The delay value is always stored in the context's SWIM delay gauge; the
 * network glitch counter is bumped by one only when the caller flags the
 * sample as a glitch.
 *
 * \param[in] crt_ctx	CaRT context whose cc_swim_delay and cc_net_glitches
 *			metrics are updated; a NULL context is logged and ignored
 * \param[in] delay	value to record in the SWIM delay gauge
 * \param[in] glitch	true if this sample should also count as a network glitch
 */
static void
crt_metrics_sample_delay(crt_context_t crt_ctx, uint64_t delay, bool glitch)
{
	struct crt_context *cart_ctx = crt_ctx;

	/* Without a context there are no metric nodes to update. */
	if (unlikely(crt_ctx == CRT_CONTEXT_NULL)) {
		D_ERROR("invalid parameter (NULL crt_ctx).\n");
		return;
	}

	d_tm_set_gauge(cart_ctx->cc_swim_delay, delay);

	if (!glitch)
		return;

	d_tm_inc_counter(cart_ctx->cc_net_glitches, 1);
}

static int64_t crt_swim_progress_cb(crt_context_t crt_ctx, int64_t timeout_us, void *arg)
{
struct crt_grp_priv *grp_priv = crt_gdata.cg_grp->gg_primary_grp;
Expand Down Expand Up @@ -1053,12 +1071,15 @@ static int64_t crt_swim_progress_cb(crt_context_t crt_ctx, int64_t timeout_us, v
swim_net_glitch_update(csm->csm_ctx, self_id, delay);
csm->csm_last_unpack_hlc = hlc2;
}
crt_metrics_sample_delay(crt_ctx, delay, delay > max_delay);
}

if (now < ctx->sc_next_event)
timeout_us = min(timeout_us, (ctx->sc_next_event - now) * 1000);
} else if (rc) {
D_ERROR("swim_progress(): "DF_RC"\n", DP_RC(rc));
} else {
crt_metrics_sample_delay(crt_ctx, 0, false);
}

return timeout_us;
Expand Down
6 changes: 6 additions & 0 deletions src/tests/ftest/util/telemetry_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,14 @@ class TelemetryUtils():
ENGINE_IO_OPS_TGT_UPDATE_ACTIVE_METRICS +\
ENGINE_IO_OPS_UPDATE_ACTIVE_METRICS
# Names of the engine network telemetry metrics, including the glitch counter
# and the SWIM delay gauge entries.
# NOTE(review): the _stddev/_max/_mean/_min names presumably come from the
# stats-gauge export of swim_delay - confirm against the telemetry producer.
ENGINE_NET_METRICS = [
"engine_net_glitch",
"engine_net_failed_addr",
"engine_net_req_timeout",
"engine_net_swim_delay_stddev",
"engine_net_swim_delay_max",
"engine_net_swim_delay_mean",
"engine_net_swim_delay",
"engine_net_swim_delay_min",
"engine_net_uri_lookup_timeout",
"engine_net_uri_lookup_other",
"engine_net_uri_lookup_self"]
Expand Down

0 comments on commit 2360d90

Please sign in to comment.