Skip to content

Commit

Permalink
DAOS-14739 cart: Add SWIM stats for delay/glitches
Browse files Browse the repository at this point in the history
Adds a gauge to measure network delay and a counter
for glitches (temporary network outages).

Run-GHA: true
Change-Id: I285e8806f9650eed10b1027f7c4c78755dfe7263
Required-githooks: true
Signed-off-by: Michael MacDonald <[email protected]>
  • Loading branch information
mjmac committed Jan 17, 2024
1 parent 8ca9d22 commit cce2084
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/rpm-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ env:
# build is done on the lowest version and test on the highest with a "sanity test"
# stage done on all versions in the list except the highest
EL8_BUILD_VERSION: 8.6
EL8_VERSION: 8
EL8_VERSION: 8.8
EL9_BUILD_VERSION: 9
EL9_VERSION: 9
LEAP15_VERSION: 15.4
LEAP15_VERSION: 15.5

on:
workflow_dispatch:
Expand Down
2 changes: 1 addition & 1 deletion ci/functional/test_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ test_cluster() {
FIRST_NODE=${first_node} \
TEST_RPMS=${TEST_RPMS} \
NODELIST=${tnodes} \
BUILD_URL=\"$BUILD_URL\" \
BUILD_URL=\"${BUILD_URL:-Unknown in GHA}\" \
STAGE_NAME=\"$STAGE_NAME\" \
$(cat ci/functional/test_main_prep_node.sh)"
}
Expand Down
2 changes: 1 addition & 1 deletion ci/gha_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ get_test_tags() {
local test_tags=()
local tags
# Test-tag: has higher priority
if [ -n "$CP_TEST_TAG" ]; then
if [ -n "${CP_TEST_TAG:-}" ]; then
tags="$CP_TEST_TAG"
else
tags="pr"
Expand Down
14 changes: 13 additions & 1 deletion src/cart/crt_context.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -322,6 +322,18 @@ crt_context_provider_create(crt_context_t *crt_ctx, crt_provider_t provider, boo
if (ret)
D_WARN("Failed to create failed addr counter: "DF_RC
"\n", DP_RC(ret));

/*
 * Register the per-context network telemetry added for SWIM: a counter of
 * network glitches and a stats gauge of reported network delay.  A failed
 * registration is only logged; execution continues with the next statement.
 */
ret = d_tm_add_metric(&ctx->cc_net_glitches, D_TM_COUNTER,
		      "Total number of network glitch errors", "errors",
		      "net/%s/glitch/ctx_%u", prov, ctx->cc_idx);
/* Report the status actually returned by d_tm_add_metric() (ret, not rc). */
if (ret)
	DL_WARN(ret, "Failed to create network glitch counter");

ret = d_tm_add_metric(&ctx->cc_net_delay, D_TM_STATS_GAUGE,
		      "Network delay measurements", "delay", "net/%s/delay/ctx_%u",
		      prov, ctx->cc_idx);
/* Same as above: the failure code lives in ret. */
if (ret)
	DL_WARN(ret, "Failed to create network delay gauge");
}

if (crt_is_service() &&
Expand Down
4 changes: 4 additions & 0 deletions src/cart/crt_internal_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,10 @@ struct crt_context {
struct d_tm_node_t *cc_timedout_uri;
/** Total number of failed address resolution, of type counter */
struct d_tm_node_t *cc_failed_addr;
/** Counter for number of network glitches */
struct d_tm_node_t *cc_net_glitches;
/** Stats gauge of reported network delays */
struct d_tm_node_t *cc_net_delay;

/** Stores self uri for the current context */
char cc_self_uri[CRT_ADDR_STR_MAX_LEN];
Expand Down
26 changes: 25 additions & 1 deletion src/cart/crt_swim.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2019-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1005,6 +1005,27 @@ static void crt_swim_update_last_unpack_hlc(struct crt_swim_membs *csm)
D_RWLOCK_UNLOCK(&crt_gdata.cg_rwlock);
}

/**
 * Record one network-delay observation in the telemetry of a CaRT context.
 *
 * \param[in] crt_ctx	Context whose metrics are updated; a NULL context is
 *			logged as an error and otherwise ignored.
 * \param[in] delay	Observed delay (units are defined by the caller).  Zero
 *			resets the gauge through d_tm_zero_gauge(); any other
 *			value is stored with d_tm_set_gauge().
 * \param[in] glitch	When true, the glitch counter is incremented by one.
 */
static void
crt_metrics_sample_delay(crt_context_t crt_ctx, uint64_t delay, bool glitch)
{
	struct crt_context *ctx = crt_ctx;

	if (unlikely(crt_ctx == CRT_CONTEXT_NULL)) {
		D_ERROR("invalid parameter (NULL crt_ctx).\n");
		return;
	}

	/* Non-zero samples are stored as-is; zero goes through the zeroing helper. */
	if (delay != 0)
		d_tm_set_gauge(ctx->cc_net_delay, delay);
	else
		d_tm_zero_gauge(ctx->cc_net_delay);

	if (!glitch)
		return;

	d_tm_inc_counter(ctx->cc_net_glitches, 1);
}

static int64_t crt_swim_progress_cb(crt_context_t crt_ctx, int64_t timeout_us, void *arg)
{
struct crt_grp_priv *grp_priv = crt_gdata.cg_grp->gg_primary_grp;
Expand Down Expand Up @@ -1053,12 +1074,15 @@ static int64_t crt_swim_progress_cb(crt_context_t crt_ctx, int64_t timeout_us, v
swim_net_glitch_update(csm->csm_ctx, self_id, delay);
csm->csm_last_unpack_hlc = hlc2;
}
crt_metrics_sample_delay(crt_ctx, delay, delay > max_delay);
}

if (now < ctx->sc_next_event)
timeout_us = min(timeout_us, (ctx->sc_next_event - now) * 1000);
} else if (rc) {
D_ERROR("swim_progress(): "DF_RC"\n", DP_RC(rc));
} else {
crt_metrics_sample_delay(crt_ctx, 0, false);
}

return timeout_us;
Expand Down
32 changes: 31 additions & 1 deletion src/gurt/telemetry.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2020-2023 Intel Corporation.
* (C) Copyright 2020-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1922,6 +1922,36 @@ has_stats(struct d_tm_node_t *metric)
metric->dtn_type == D_TM_STATS_GAUGE);
}

/**
 * Reset a gauge to zero, doing nothing when it already reads zero so that
 * repeated calls do not feed redundant zero samples into the statistics.
 *
 * A NULL \a metric is silently ignored.  A metric that is not a gauge is
 * reported with D_ERROR and left untouched.
 *
 * \param[in,out]	metric	Pointer to the metric
 */
void
d_tm_zero_gauge(struct d_tm_node_t *metric)
{
	if (metric == NULL)
		return;

	if (!is_gauge(metric)) {
		D_ERROR("Failed to zero gauge [%s] on item not a gauge. "
			"Operation mismatch: " DF_RC "\n",
			metric->dtn_name, DP_RC(-DER_OP_NOT_PERMITTED));
		return;
	}

	d_tm_node_lock(metric);

	/* Already zero: leave the value and the stats alone. */
	if (metric->dtn_metric->dtm_data.value == 0)
		goto unlock;

	metric->dtn_metric->dtm_data.value = 0;
	if (has_stats(metric)) {
		/* Account for the new zero sample in the stats and the histogram. */
		d_tm_compute_stats(metric, 0);
		d_tm_compute_histogram(metric, 0);
	}

unlock:
	d_tm_node_unlock(metric);
}

/**
* Set an arbitrary \a value for the gauge.
*
Expand Down
4 changes: 3 additions & 1 deletion src/include/gurt/telemetry_producer.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2020-2023 Intel Corporation.
* (C) Copyright 2020-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -19,6 +19,8 @@ void d_tm_mark_duration_end(struct d_tm_node_t *metric);
void d_tm_set_gauge(struct d_tm_node_t *metric, uint64_t value);
void d_tm_inc_gauge(struct d_tm_node_t *metric, uint64_t value);
void d_tm_dec_gauge(struct d_tm_node_t *metric, uint64_t value);
void
d_tm_zero_gauge(struct d_tm_node_t *metric);

/* Other server functions */
int d_tm_init(int id, uint64_t mem_size, int flags);
Expand Down

0 comments on commit cce2084

Please sign in to comment.