Skip to content

Commit

Permalink
DAOS-16111 rebuild: enhance leader update_and_warn_for_slow_engines()
Browse files Browse the repository at this point in the history
With this change, the existing warning logic invoked by the PS leader
engine during rebuild_leader_status_check() is enhanced to check for
"almost done" and likely hung rebuilds. When 95% or more engines have
completed their scan or pull phase, this logic will set a deadline of
2 minutes. If the relevant rebuild phase has not finished by the
deadline, warning messages will be logged to indicate a potentially
stuck rebuild, including a list of engines that have not completed.

The determination of "almost done" is such that, depending on the
scale of the system, the number of remaining engines that are being
waited for is in a reasonable range of 1-20 engines.

The daos_test -r rebuild tests include a new rebuild_long_scan_hang()
to inject a single-engine scan hang with a > 2 minute delay,
to exercise the new warning logic.

Features: rebuild

Signed-off-by: Kenneth Cain <kenneth.cain@hpe.com>
  • Loading branch information
kccain committed Jan 23, 2025
1 parent a3d4d2f commit 6a8ff39
Showing 5 changed files with 183 additions and 45 deletions.
9 changes: 7 additions & 2 deletions src/rebuild/rebuild_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2017-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
@@ -103,6 +104,7 @@ struct rebuild_tgt_pool_tracker {

struct rebuild_server_status {
d_rank_t rank;
double last_update;
uint32_t dtx_resync_version;
uint32_t scan_done:1,
pull_done:1;
@@ -122,8 +124,11 @@ struct rebuild_global_pool_tracker {

/** rebuild status for each server */
struct rebuild_server_status *rgt_servers;
double *rgt_servers_last_update;
double rgt_last_warn;

/** timestamps for the global operation */
double rgt_last_warn_ts; /* time of most recent warning log for "slow engines" */
double rgt_scan_warn_deadline_ts; /* time after which to warn that scan may be hung */
double rgt_pull_warn_deadline_ts; /* time after which to warn that pull may be hung */

/** indirect indices for binary search by rank */
struct rebuild_server_status **rgt_servers_sorted;
164 changes: 121 additions & 43 deletions src/rebuild/srv.c
Original file line number Diff line number Diff line change
@@ -160,6 +160,23 @@ is_rebuild_global_done(struct rebuild_global_pool_tracker *rgt)

}

/* determine if "most" engines are done with their current rebuild phase (scan or pull) */
static bool
is_rebuild_phase_mostly_done(int engines_done_ct, int engines_total_ct)
{
const int MIN_WAIT_CT = 1;
const int MAX_WAIT_CT = 20;
int wait_ct_threshold;
int engines_waiting_ct = engines_total_ct - engines_done_ct;

/* When is the global operation "mostly done" (e.g., waiting for 20 or fewer engines)? */
wait_ct_threshold = engines_done_ct - .95 * engines_total_ct;
wait_ct_threshold = max(MIN_WAIT_CT, wait_ct_threshold);
wait_ct_threshold = min(MAX_WAIT_CT, wait_ct_threshold);

return (engines_waiting_ct <= wait_ct_threshold);
}

#define SCAN_DONE 0x1
#define PULL_DONE 0x2

@@ -246,7 +263,7 @@ rebuild_leader_set_update_time(struct rebuild_global_pool_tracker *rgt, d_rank_t
i = daos_array_find(rgt->rgt_servers_sorted, rgt->rgt_servers_number, rank,
&servers_sort_ops);
if (i >= 0) {
rgt->rgt_servers_last_update[i] = ABT_get_wtime();
rgt->rgt_servers[i].last_update = ABT_get_wtime();
return;
}
D_INFO("rank %u is not included in this rebuild.\n", rank);
@@ -320,6 +337,101 @@ rpt_lookup(uuid_t pool_uuid, uint32_t opc, unsigned int ver, unsigned int gen)
return found;
}

static void
update_and_warn_for_slow_engines(struct rebuild_global_pool_tracker *rgt)
{
int i;
int scan_ct = 0;
int pull_ct = 0;
int done_ct;
int wait_ct;
bool scan_gl = false;
bool pull_gl = false;
bool do_warn = false;
double now = ABT_get_wtime();
double tw = now - rgt->rgt_last_warn_ts; /* time since last warning logged */
bool warned = false;

/* Throttle warnings to not more often than once per 2 minutes */
do_warn = (tw >= 120);

/* Count scan/pull progress and warn for any ranks that haven't provided updates recently */
for (i = 0; i < rgt->rgt_servers_number; i++) {
double tu = now - rgt->rgt_servers[i].last_update;
d_rank_t r = rgt->rgt_servers[i].rank;

if (rgt->rgt_servers[i].scan_done) {
scan_ct++;
if (rgt->rgt_servers[i].pull_done) {
pull_ct++;
continue;
}
}

if (!do_warn)
continue;

if (tu > 30) {
D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. "
"scan_done=%d pull_done=%d\n",
DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done,
rgt->rgt_servers[i].pull_done);
warned = true;
}
}

scan_gl = (scan_ct == rgt->rgt_servers_number);
pull_gl = (pull_ct == rgt->rgt_servers_number);
if (scan_gl && pull_gl)
return;

/* Determine if scan/pull progress is almost done; mark the time and warn on possible stall.
*/
done_ct = scan_gl ? pull_ct : scan_ct;
wait_ct = rgt->rgt_servers_number - done_ct;
#if 0
D_DEBUG(DB_TRACE, DF_RB ": s_done=%s, s_ct=%d, p_done=%s, p_ct=%d, servers=%d, d_ct=%d, "
" almost=%s\n", DP_RB_RGT(rgt), scan_gl ? "yes" : "no", scan_ct, pull_gl ? "yes" : "no",
pull_ct, rgt->rgt_servers_number, done_ct,
is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number) ? "yes" : "no");
#endif
if (is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number)) {
if (!scan_gl && rgt->rgt_scan_warn_deadline_ts == 0.0) {
rgt->rgt_scan_warn_deadline_ts = now + 120.0;
D_DEBUG(DB_REBUILD, DF_RB ": scan almost done, %d/%d engines\n",
DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number);
} else if (!pull_gl && rgt->rgt_pull_warn_deadline_ts == 0.0) {
rgt->rgt_pull_warn_deadline_ts = now + 120.0;
D_DEBUG(DB_REBUILD, DF_RB ": pull almost done, %d/%d engines\n",
DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number);
}

if (!do_warn)
return;

if (!scan_gl && now > rgt->rgt_scan_warn_deadline_ts) {
D_WARN(DF_RB ": scan hung? waiting for %d/%d engines:\n",
DP_RB_RGT(rgt), wait_ct, rgt->rgt_servers_number);
for (i = 0; i < rgt->rgt_servers_number; i++)
if (!rgt->rgt_servers[i].scan_done)
D_WARN(DF_RB ": rank %u not finished scanning!\n", DP_RB_RGT(rgt),
rgt->rgt_servers[i].rank);
warned = true;
} else if (!pull_gl && now > rgt->rgt_pull_warn_deadline_ts) {
D_WARN(DF_RB ": pull hung? waiting for %d/%d engines:\n",
DP_RB_RGT(rgt), wait_ct, rgt->rgt_servers_number);
for (i = 0; i < rgt->rgt_servers_number; i++)
if (!rgt->rgt_servers[i].pull_done)
D_WARN(DF_RB ": rank %u not finished pulling!\n", DP_RB_RGT(rgt),
rgt->rgt_servers[i].rank);
warned = true;
}

if (warned)
rgt->rgt_last_warn_ts = now;
}
}

int
rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt,
struct rebuild_iv *iv)
@@ -346,8 +458,10 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt,
* both scan and pull status to the leader, i.e. they
* both can be trusted.
*/
if (iv->riv_status == 0)
if (iv->riv_status == 0) {
/* test only: update_and_warn_for_slow_engines(rgt); */
return 0;
}
}

/* Only trust pull done if scan is done globally */
@@ -358,6 +472,8 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt,
rgt->rgt_rebuild_ver, iv->riv_rank);
}

/* test only: update_and_warn_for_slow_engines(rgt); */

return 0;
}

@@ -703,39 +819,6 @@ enum {
RB_BCAST_QUERY,
};

static void
warn_for_slow_engine_updates(struct rebuild_global_pool_tracker *rgt)
{
int i;
double now = ABT_get_wtime();
double tw = now - rgt->rgt_last_warn; /* time since last warning logged */
bool warned = false;

/* Throttle warnings to not more often than once per 2 minutes */
if (tw < 120)
return;

/* Warn for ranks not done and that haven't provided updates in a while (> 30 sec) */
for (i = 0; i < rgt->rgt_servers_number; i++) {
double tu = now - rgt->rgt_servers_last_update[i];
d_rank_t r = rgt->rgt_servers[i].rank;

if (rgt->rgt_servers[i].scan_done && rgt->rgt_servers[i].pull_done)
continue;

if (tu > 30) {
D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. "
"scan_done=%d pull_done=%d\n",
DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done,
rgt->rgt_servers[i].pull_done);
warned = true;
}
}

if (warned)
rgt->rgt_last_warn = now;
}

/*
* Check rebuild status on the leader. Every other target sends
* its own rebuild status by IV.
@@ -865,7 +948,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
D_PRINT("%s", sbuf);
}
sleep:
warn_for_slow_engine_updates(rgt);
update_and_warn_for_slow_engines(rgt);
sched_req_sleep(rgt->rgt_ult, RBLD_CHECK_INTV);
}

@@ -882,8 +965,6 @@ rebuild_global_pool_tracker_destroy(struct rebuild_global_pool_tracker *rgt)
D_FREE(rgt->rgt_servers);
if (rgt->rgt_servers_sorted)
D_FREE(rgt->rgt_servers_sorted);
if (rgt->rgt_servers_last_update)
D_FREE(rgt->rgt_servers_last_update);

if (rgt->rgt_lock)
ABT_mutex_free(&rgt->rgt_lock);
@@ -921,16 +1002,13 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t
D_ALLOC_ARRAY(rgt->rgt_servers_sorted, rank_nr);
if (rgt->rgt_servers_sorted == NULL)
D_GOTO(out, rc = -DER_NOMEM);
D_ALLOC_ARRAY(rgt->rgt_servers_last_update, rank_nr);
if (rgt->rgt_servers_last_update == NULL)
D_GOTO(out, rc = -DER_NOMEM);

now = ABT_get_wtime();
rgt->rgt_last_warn = now;
rgt->rgt_last_warn_ts = now;
for (i = 0; i < rank_nr; i++) {
rgt->rgt_servers_sorted[i] = &rgt->rgt_servers[i];
rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank;
rgt->rgt_servers_last_update[i] = now;
rgt->rgt_servers[i].last_update = now;
}
rgt->rgt_servers_number = rank_nr;

17 changes: 17 additions & 0 deletions src/tests/ftest/daos_test/rebuild.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
(C) Copyright 2018-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent
'''
@@ -365,3 +366,19 @@ def test_rebuild_35(self):
:avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_35
"""
self.run_subtest()

def test_rebuild_36(self):
"""Jira ID: DAOS-16111
Test Description:
Run daos_test -r -s5 -u subtests=36
Use cases:
Core tests for daos_test rebuild
:avocado: tags=all,full_regression
:avocado: tags=hw,medium
:avocado: tags=unittest,rebuild
:avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_36
"""
self.run_subtest()
4 changes: 4 additions & 0 deletions src/tests/ftest/daos_test/rebuild.yaml
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@ timeouts:
test_rebuild_33: 200
test_rebuild_34: 200
test_rebuild_35: 180
test_rebuild_36: 250
pool:
nvme_size: 0G

@@ -81,6 +82,7 @@ daos_tests:
test_rebuild_33: DAOS_Rebuild_33
test_rebuild_34: DAOS_Rebuild_34
test_rebuild_35: DAOS_Rebuild_35
test_rebuild_36: DAOS_Rebuild_36
daos_test:
test_rebuild_0to10: r
test_rebuild_12to15: r
@@ -104,6 +106,7 @@ daos_tests:
test_rebuild_33: r
test_rebuild_34: r
test_rebuild_35: r
test_rebuild_36: r
args:
test_rebuild_0to10: -s3 -u subtests="0-10"
test_rebuild_12to15: -s3 -u subtests="12-15"
@@ -127,6 +130,7 @@ daos_tests:
test_rebuild_33: -s5 -u subtests="33"
test_rebuild_34: -s5 -u subtests="34"
test_rebuild_35: -s5 -u subtests="35"
test_rebuild_36: -s5 -u subtests="36"
stopped_ranks:
test_rebuild_26: ["random"]
test_rebuild_27: ["random"]
34 changes: 34 additions & 0 deletions src/tests/suite/daos_rebuild.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
@@ -1563,6 +1564,36 @@ rebuild_cont_destroy_and_reintegrate(void **state)
reintegrate_single_pool_rank(arg, 5, true);
}

static void
rebuild_long_scan_hang(void **state)
{
test_arg_t *arg = *state;
d_rank_t fault_rank = 1;
d_rank_t excl_rank = 0;
int rc;

if (!test_runable(arg, 7) || arg->pool.alive_svc->rl_nr < 5) {
print_message("need at least 5 svcs, -s5\n");
return;
}

print_message("inject DAOS_REBUILD_TGT_SCAN_HANG fault into engine rank %u\n", fault_rank);
test_set_engine_fail_loc(arg, fault_rank, DAOS_REBUILD_TGT_SCAN_HANG | DAOS_FAIL_ALWAYS);

print_message("exclude engine rank %u to cause rebuild, and delay > 2 minutes\n", excl_rank);
rc = dmg_pool_exclude(arg->dmg_config, arg->pool.pool_uuid, arg->group, excl_rank, -1);
assert_success(rc);

/* wait a long time, cause PS leader engine logic to warn about "almost done" rebuild
* see update_and_warn_for_slow_engines()
*/
sleep (130);
print_message("delay done, clear fault on engine rank %u and wait for rebuild\n", fault_rank);
test_set_engine_fail_loc(arg, fault_rank, 0);
test_rebuild_wait(&arg, 1);
print_message("rebuild done\n");
}

/** create a new pool/container for each test */
static const struct CMUnitTest rebuild_tests[] = {
{"REBUILD0: drop rebuild scan reply",
@@ -1655,6 +1686,9 @@ static const struct CMUnitTest rebuild_tests[] = {
{"REBUILD35: destroy container then reintegrate",
rebuild_cont_destroy_and_reintegrate, rebuild_sub_6nodes_rf1_setup,
rebuild_sub_teardown},
{"REBUILD36: single engine scan lengthy hang",
rebuild_long_scan_hang, rebuild_sub_setup,
rebuild_sub_teardown},
};

/* TODO: Enable aggregation once stable view rebuild is done. */

0 comments on commit 6a8ff39

Please sign in to comment.