From 65b244705d6e33c6dcf2586d88c486a77a3a9a67 Mon Sep 17 00:00:00 2001 From: Kenneth Cain Date: Tue, 7 Jan 2025 16:22:51 -0500 Subject: [PATCH 1/3] DAOS-16111 rebuild: enhance leader update_and_warn_for_slow_engines() With this change, the existing warning logic invoked by the PS leader engine during rebuild_leader_status_check() is enhanced to check for "almost done" and likely hung rebuilds. When 95% or more engines have completed their scan or pull phase, this logic will set a deadline of 2 minutes. If the relevant rebuild phase has not finished by the deadline, warning messages will be logged to indicate a potentially stuck rebuild, including a list of engines that have not completed. The determination of "almost done" is such that, depending on the scale of the system, the number of remaining engines that are being waited for is in a reasonable range of 1-20 engines. The daos_test -r rebuild tests include a new rebuild_long_scan_hang() to inject a single-engine scan hang with a > 2 minute delay, to exercise the new warning logic. Features: rebuild Signed-off-by: Kenneth Cain --- src/rebuild/rebuild_internal.h | 9 +- src/rebuild/srv.c | 164 +++++++++++++++------ src/tests/ftest/daos_test/rebuild.py | 17 +++ src/tests/ftest/daos_test/rebuild.yaml | 4 + src/tests/suite/daos_rebuild.c | 196 +++++++++++++------------ 5 files changed, 255 insertions(+), 135 deletions(-) diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 28a8d6b747f..dda95847ecb 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -103,6 +104,7 @@ struct rebuild_tgt_pool_tracker { struct rebuild_server_status { d_rank_t rank; + double last_update; uint32_t dtx_resync_version; uint32_t scan_done:1, pull_done:1; @@ -122,8 +124,11 @@ struct rebuild_global_pool_tracker { /** rebuild status for each server */ struct rebuild_server_status *rgt_servers; - double *rgt_servers_last_update; - double rgt_last_warn; + + /** timestamps for the global operation */ + double rgt_last_warn_ts; /* time of most recent warning log for "slow engines" */ + double rgt_scan_warn_deadline_ts; /* time after which to warn that scan may be hung */ + double rgt_pull_warn_deadline_ts; /* time after which to warn that pull may be hung */ /** indirect indices for binary search by rank */ struct rebuild_server_status **rgt_servers_sorted; diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index c18486e35ec..d99b2075956 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -160,6 +160,23 @@ is_rebuild_global_done(struct rebuild_global_pool_tracker *rgt) } +/* determine if "most" engines are done with their current rebuild phase (scan or pull) */ +static bool +is_rebuild_phase_mostly_done(int engines_done_ct, int engines_total_ct) +{ + const int MIN_WAIT_CT = 1; + const int MAX_WAIT_CT = 20; + int wait_ct_threshold; + int engines_waiting_ct = engines_total_ct - engines_done_ct; + + /* When is the global operation "mostly done" (e.g., waiting for 20 or fewer engines)? 
*/ + wait_ct_threshold = engines_done_ct - .95 * engines_total_ct; + wait_ct_threshold = max(MIN_WAIT_CT, wait_ct_threshold); + wait_ct_threshold = min(MAX_WAIT_CT, wait_ct_threshold); + + return (engines_waiting_ct <= wait_ct_threshold); +} + #define SCAN_DONE 0x1 #define PULL_DONE 0x2 @@ -246,7 +263,7 @@ rebuild_leader_set_update_time(struct rebuild_global_pool_tracker *rgt, d_rank_t i = daos_array_find(rgt->rgt_servers_sorted, rgt->rgt_servers_number, rank, &servers_sort_ops); if (i >= 0) { - rgt->rgt_servers_last_update[i] = ABT_get_wtime(); + rgt->rgt_servers[i].last_update = ABT_get_wtime(); return; } D_INFO("rank %u is not included in this rebuild.\n", rank); @@ -320,6 +337,101 @@ rpt_lookup(uuid_t pool_uuid, uint32_t opc, unsigned int ver, unsigned int gen) return found; } +static void +update_and_warn_for_slow_engines(struct rebuild_global_pool_tracker *rgt) +{ + int i; + int scan_ct = 0; + int pull_ct = 0; + int done_ct; + int wait_ct; + bool scan_gl = false; + bool pull_gl = false; + bool do_warn = false; + double now = ABT_get_wtime(); + double tw = now - rgt->rgt_last_warn_ts; /* time since last warning logged */ + bool warned = false; + + /* Throttle warnings to not more often than once per 2 minutes */ + do_warn = (tw >= 120); + + /* Count scan/pull progress and warn for any ranks that haven't provided updates recently */ + for (i = 0; i < rgt->rgt_servers_number; i++) { + double tu = now - rgt->rgt_servers[i].last_update; + d_rank_t r = rgt->rgt_servers[i].rank; + + if (rgt->rgt_servers[i].scan_done) { + scan_ct++; + if (rgt->rgt_servers[i].pull_done) { + pull_ct++; + continue; + } + } + + if (!do_warn) + continue; + + if (tu > 30) { + D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. " + "scan_done=%d pull_done=%d\n", + DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done, + rgt->rgt_servers[i].pull_done); + warned = true; + } + } + + scan_gl = (scan_ct == rgt->rgt_servers_number); + pull_gl = (pull_ct == rgt->rgt_servers_number); + if (scan_gl && pull_gl) + return; + + /* Determine if scan/pull progress is almost done; mark the time and warn on possible stall. + */ + done_ct = scan_gl ? pull_ct : scan_ct; + wait_ct = rgt->rgt_servers_number - done_ct; +#if 0 + D_DEBUG(DB_TRACE, DF_RB ": s_done=%s, s_ct=%d, p_done=%s, p_ct=%d, servers=%d, d_ct=%d, " + " almost=%s\n", DP_RB_RGT(rgt), scan_gl ? "yes" : "no", scan_ct, pull_gl ? "yes" : "no", + pull_ct, rgt->rgt_servers_number, done_ct, + is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number) ? "yes" : "no"); +#endif + if (is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number)) { + if (!scan_gl && rgt->rgt_scan_warn_deadline_ts == 0.0) { + rgt->rgt_scan_warn_deadline_ts = now + 120.0; + D_DEBUG(DB_REBUILD, DF_RB ": scan almost done, %d/%d engines\n", + DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number); + } else if (!pull_gl && rgt->rgt_pull_warn_deadline_ts == 0.0) { + rgt->rgt_pull_warn_deadline_ts = now + 120.0; + D_DEBUG(DB_REBUILD, DF_RB ": pull almost done, %d/%d engines\n", + DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number); + } + + if (!do_warn) + return; + + if (!scan_gl && now > rgt->rgt_scan_warn_deadline_ts) { + D_WARN(DF_RB ": scan hung? 
waiting for %d/%d engines:\n", DP_RB_RGT(rgt), + wait_ct, rgt->rgt_servers_number); + for (i = 0; i < rgt->rgt_servers_number; i++) + if (!rgt->rgt_servers[i].scan_done) + D_WARN(DF_RB ": rank %u not finished scanning!\n", + DP_RB_RGT(rgt), rgt->rgt_servers[i].rank); + warned = true; + } else if (!pull_gl && now > rgt->rgt_pull_warn_deadline_ts) { + D_WARN(DF_RB ": pull hung? waiting for %d/%d engines:\n", DP_RB_RGT(rgt), + wait_ct, rgt->rgt_servers_number); + for (i = 0; i < rgt->rgt_servers_number; i++) + if (!rgt->rgt_servers[i].pull_done) + D_WARN(DF_RB ": rank %u not finished pulling!\n", + DP_RB_RGT(rgt), rgt->rgt_servers[i].rank); + warned = true; + } + + if (warned) + rgt->rgt_last_warn_ts = now; + } +} + int rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, struct rebuild_iv *iv) @@ -346,8 +458,10 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, * both scan and pull status to the leader, i.e. they * both can be trusted. */ - if (iv->riv_status == 0) + if (iv->riv_status == 0) { + /* test only: update_and_warn_for_slow_engines(rgt); */ return 0; + } } /* Only trust pull done if scan is done globally */ @@ -358,6 +472,8 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, rgt->rgt_rebuild_ver, iv->riv_rank); } + /* test only: update_and_warn_for_slow_engines(rgt); */ + return 0; } @@ -703,39 +819,6 @@ enum { RB_BCAST_QUERY, }; -static void -warn_for_slow_engine_updates(struct rebuild_global_pool_tracker *rgt) -{ - int i; - double now = ABT_get_wtime(); - double tw = now - rgt->rgt_last_warn; /* time since last warning logged */ - bool warned = false; - - /* Throttle warnings to not more often than once per 2 minutes */ - if (tw < 120) - return; - - /* Warn for ranks not done and that haven't provided updates in a while (> 30 sec) */ - for (i = 0; i < rgt->rgt_servers_number; i++) { - double tu = now - rgt->rgt_servers_last_update[i]; - d_rank_t r = rgt->rgt_servers[i].rank; - - if (rgt->rgt_servers[i].scan_done && rgt->rgt_servers[i].pull_done) - continue; - - if (tu > 30) { - D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. " - "scan_done=%d pull_done=%d\n", - DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done, - rgt->rgt_servers[i].pull_done); - warned = true; - } - } - - if (warned) - rgt->rgt_last_warn = now; -} - /* * Check rebuild status on the leader. Every other target sends * its own rebuild status by IV. 
@@ -865,7 +948,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, D_PRINT("%s", sbuf); } sleep: - warn_for_slow_engine_updates(rgt); + update_and_warn_for_slow_engines(rgt); sched_req_sleep(rgt->rgt_ult, RBLD_CHECK_INTV); } @@ -882,8 +965,6 @@ rebuild_global_pool_tracker_destroy(struct rebuild_global_pool_tracker *rgt) D_FREE(rgt->rgt_servers); if (rgt->rgt_servers_sorted) D_FREE(rgt->rgt_servers_sorted); - if (rgt->rgt_servers_last_update) - D_FREE(rgt->rgt_servers_last_update); if (rgt->rgt_lock) ABT_mutex_free(&rgt->rgt_lock); @@ -921,16 +1002,13 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t D_ALLOC_ARRAY(rgt->rgt_servers_sorted, rank_nr); if (rgt->rgt_servers_sorted == NULL) D_GOTO(out, rc = -DER_NOMEM); - D_ALLOC_ARRAY(rgt->rgt_servers_last_update, rank_nr); - if (rgt->rgt_servers_last_update == NULL) - D_GOTO(out, rc = -DER_NOMEM); now = ABT_get_wtime(); - rgt->rgt_last_warn = now; + rgt->rgt_last_warn_ts = now; for (i = 0; i < rank_nr; i++) { rgt->rgt_servers_sorted[i] = &rgt->rgt_servers[i]; rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank; - rgt->rgt_servers_last_update[i] = now; + rgt->rgt_servers[i].last_update = now; } rgt->rgt_servers_number = rank_nr; diff --git a/src/tests/ftest/daos_test/rebuild.py b/src/tests/ftest/daos_test/rebuild.py index 62c66b7da21..012b157d42c 100644 --- a/src/tests/ftest/daos_test/rebuild.py +++ b/src/tests/ftest/daos_test/rebuild.py @@ -1,5 +1,6 @@ ''' (C) Copyright 2018-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -365,3 +366,19 @@ def test_rebuild_35(self): :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_35 """ self.run_subtest() + + def test_rebuild_36(self): + """Jira ID: DAOS-16111 + + Test Description: + Run daos_test -r -s5 -u subtests=36 + + Use cases: + Core tests for daos_test rebuild + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=unittest,rebuild + :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_36 + """ + self.run_subtest() diff --git a/src/tests/ftest/daos_test/rebuild.yaml b/src/tests/ftest/daos_test/rebuild.yaml index 9ceb3c02afb..0cac7bcffb4 100644 --- a/src/tests/ftest/daos_test/rebuild.yaml +++ b/src/tests/ftest/daos_test/rebuild.yaml @@ -13,6 +13,7 @@ timeouts: test_rebuild_33: 200 test_rebuild_34: 200 test_rebuild_35: 180 + test_rebuild_36: 250 pool: nvme_size: 0G @@ -81,6 +82,7 @@ daos_tests: test_rebuild_33: DAOS_Rebuild_33 test_rebuild_34: DAOS_Rebuild_34 test_rebuild_35: DAOS_Rebuild_35 + test_rebuild_36: DAOS_Rebuild_36 daos_test: test_rebuild_0to10: r test_rebuild_12to15: r @@ -104,6 +106,7 @@ daos_tests: test_rebuild_33: r test_rebuild_34: r test_rebuild_35: r + test_rebuild_36: r args: test_rebuild_0to10: -s3 -u subtests="0-10" test_rebuild_12to15: -s3 -u subtests="12-15" @@ -127,6 +130,7 @@ daos_tests: test_rebuild_33: -s5 -u subtests="33" test_rebuild_34: -s5 -u subtests="34" test_rebuild_35: -s5 -u subtests="35" + test_rebuild_36: -s5 -u subtests="36" stopped_ranks: test_rebuild_26: ["random"] test_rebuild_27: ["random"] diff --git a/src/tests/suite/daos_rebuild.c b/src/tests/suite/daos_rebuild.c index 20a751504ac..e76c02cf56f 100644 --- a/src/tests/suite/daos_rebuild.c +++ b/src/tests/suite/daos_rebuild.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1563,98 +1564,113 @@ rebuild_cont_destroy_and_reintegrate(void **state) reintegrate_single_pool_rank(arg, 5, true); } +static void +rebuild_long_scan_hang(void **state) +{ + test_arg_t *arg = *state; + d_rank_t fault_rank = 1; + d_rank_t excl_rank = 0; + int rc; + + if (!test_runable(arg, 7) || arg->pool.alive_svc->rl_nr < 5) { + print_message("need at least 5 svcs, -s5\n"); + return; + } + + print_message("inject DAOS_REBUILD_TGT_SCAN_HANG fault into engine rank %u\n", fault_rank); + test_set_engine_fail_loc(arg, fault_rank, DAOS_REBUILD_TGT_SCAN_HANG | DAOS_FAIL_ALWAYS); + + print_message("exclude engine rank %u to cause rebuild, and delay > 2 minutes\n", + excl_rank); + rc = dmg_pool_exclude(arg->dmg_config, arg->pool.pool_uuid, arg->group, excl_rank, -1); + assert_success(rc); + + /* wait a long time, cause PS leader engine logic to warn about "almost done" rebuild + * see update_and_warn_for_slow_engines() + */ + sleep(130); + print_message("delay done, clear fault on engine rank %u and wait for rebuild\n", + fault_rank); + test_set_engine_fail_loc(arg, fault_rank, 0); + test_rebuild_wait(&arg, 1); + print_message("rebuild done\n"); +} + /** create a new pool/container for each test */ static const struct CMUnitTest rebuild_tests[] = { - {"REBUILD0: drop rebuild scan reply", - rebuild_drop_scan, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD1: retry rebuild for not ready", - rebuild_retry_rebuild, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD2: drop rebuild obj reply", - rebuild_drop_obj, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD3: rebuild multiple pools", - rebuild_multiple_pools, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD4: rebuild update failed", - rebuild_update_failed, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD5: retry rebuild for pool stale", - rebuild_retry_for_stale_pool, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD6: rebuild with container destroy", - rebuild_destroy_container, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD7: rebuild with container close", - rebuild_close_container, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD8: rebuild with pool destroy during scan", - rebuild_destroy_pool_during_scan, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD9: rebuild with pool destroy during rebuild", - rebuild_destroy_pool_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD10: rebuild iv tgt fail", - rebuild_iv_tgt_fail, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD11: rebuild tgt start fail", - rebuild_tgt_start_fail, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD12: rebuild send objects failed", - rebuild_send_objects_fail, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD13: rebuild empty pool offline", - rebuild_offline_empty, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD14: rebuild no space failure", - rebuild_nospace, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD15: rebuild multiple tgts", - rebuild_multiple_tgts, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD16: disconnect pool during scan", - rebuild_tgt_pool_disconnect_in_scan, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD17: disconnect pool during rebuild", - rebuild_tgt_pool_disconnect_in_rebuild, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD18: 
multi-pools rebuild concurrently", - multi_pools_rebuild_concurrently, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD19: rebuild with master change during scan", - rebuild_master_change_during_scan, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD20: rebuild with master change during rebuild", - rebuild_master_change_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD21: rebuild with master failure", - rebuild_master_failure, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD22: connect pool during scan for offline rebuild", - rebuild_offline_pool_connect_in_scan, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD23: connect pool during rebuild for offline rebuild", - rebuild_offline_pool_connect_in_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD24: offline rebuild", - rebuild_offline, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD25: rebuild with two failures", - rebuild_multiple_failures, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD26: rebuild fail all replicas before rebuild", - rebuild_fail_all_replicas_before_rebuild, rebuild_sub_rf1_setup, - rebuild_sub_teardown}, - {"REBUILD27: rebuild fail all replicas", - rebuild_fail_all_replicas, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD28: rebuild kill rank during rebuild", - rebuild_kill_rank_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD29: rebuild kill PS leader during rebuild", - rebuild_kill_PS_leader_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD30: destroy pool during rebuild failure and retry", - rebuild_pool_destroy_during_rebuild_failure, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD31: reintegrate failure and retry", - reintegrate_failure_and_retry, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD32: kill more ranks than RF, then reintegrate", - rebuild_kill_more_RF_ranks, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD33: delay rebuild and extend", - rebuild_delay_and_reintegrate, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD34: delay rebuild and extend", - rebuild_delay_and_extend, rebuild_sub_6nodes_rf1_setup, rebuild_sub_teardown}, - {"REBUILD35: destroy container then reintegrate", - rebuild_cont_destroy_and_reintegrate, rebuild_sub_6nodes_rf1_setup, - rebuild_sub_teardown}, + {"REBUILD0: drop rebuild scan reply", rebuild_drop_scan, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD1: retry rebuild for not ready", rebuild_retry_rebuild, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD2: drop rebuild obj reply", rebuild_drop_obj, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD3: rebuild multiple pools", rebuild_multiple_pools, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD4: rebuild update failed", rebuild_update_failed, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD5: retry rebuild for pool stale", rebuild_retry_for_stale_pool, + rebuild_small_sub_setup, rebuild_sub_teardown}, + {"REBUILD6: rebuild with container destroy", rebuild_destroy_container, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD7: rebuild with container close", rebuild_close_container, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD8: rebuild with pool destroy during scan", rebuild_destroy_pool_during_scan, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD9: rebuild with pool destroy during rebuild", rebuild_destroy_pool_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD10: rebuild 
iv tgt fail", rebuild_iv_tgt_fail, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD11: rebuild tgt start fail", rebuild_tgt_start_fail, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD12: rebuild send objects failed", rebuild_send_objects_fail, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD13: rebuild empty pool offline", rebuild_offline_empty, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD14: rebuild no space failure", rebuild_nospace, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD15: rebuild multiple tgts", rebuild_multiple_tgts, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD16: disconnect pool during scan", rebuild_tgt_pool_disconnect_in_scan, + rebuild_small_sub_setup, rebuild_sub_teardown}, + {"REBUILD17: disconnect pool during rebuild", rebuild_tgt_pool_disconnect_in_rebuild, + rebuild_small_sub_setup, rebuild_sub_teardown}, + {"REBUILD18: multi-pools rebuild concurrently", multi_pools_rebuild_concurrently, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD19: rebuild with master change during scan", rebuild_master_change_during_scan, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD20: rebuild with master change during rebuild", rebuild_master_change_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD21: rebuild with master failure", rebuild_master_failure, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD22: connect pool during scan for offline rebuild", + rebuild_offline_pool_connect_in_scan, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD23: connect pool during rebuild for offline rebuild", + rebuild_offline_pool_connect_in_rebuild, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD24: offline rebuild", rebuild_offline, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD25: rebuild with two failures", rebuild_multiple_failures, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD26: rebuild fail all replicas before rebuild", + rebuild_fail_all_replicas_before_rebuild, rebuild_sub_rf1_setup, rebuild_sub_teardown}, + {"REBUILD27: rebuild fail all replicas", rebuild_fail_all_replicas, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD28: rebuild kill rank during rebuild", rebuild_kill_rank_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD29: rebuild kill PS leader during rebuild", rebuild_kill_PS_leader_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD30: destroy pool during rebuild failure and retry", + rebuild_pool_destroy_during_rebuild_failure, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD31: reintegrate failure and retry", reintegrate_failure_and_retry, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD32: kill more ranks than RF, then reintegrate", rebuild_kill_more_RF_ranks, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD33: delay rebuild and extend", rebuild_delay_and_reintegrate, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD34: delay rebuild and extend", rebuild_delay_and_extend, rebuild_sub_6nodes_rf1_setup, + rebuild_sub_teardown}, + {"REBUILD35: destroy container then reintegrate", rebuild_cont_destroy_and_reintegrate, + rebuild_sub_6nodes_rf1_setup, rebuild_sub_teardown}, + {"REBUILD36: single engine scan lengthy hang", rebuild_long_scan_hang, rebuild_sub_setup, + rebuild_sub_teardown}, }; /* TODO: Enable aggregation once stable view rebuild is done. 
*/ From a6c2c4a2b08b077cdffcb7fa9e3cb19418ee6f9f Mon Sep 17 00:00:00 2001 From: Kenneth Cain Date: Mon, 27 Jan 2025 21:29:53 -0500 Subject: [PATCH 2/3] on IV update with nonzero RC, warn in rebuild_global_status_update() daos_sys_logscan.py - detect update_and_warn_for_slow_engines() WARN daos_sys_logscan.py - detect "failed" rebuild, print rc, and fail_rank Features: rebuild Signed-off-by: Kenneth Cain --- src/rebuild/srv.c | 20 +- src/tests/ftest/cart/util/daos_sys_logscan.py | 236 ++++++++++++------ 2 files changed, 169 insertions(+), 87 deletions(-) diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index d99b2075956..03c2fa22b75 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -438,9 +438,9 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, { rebuild_leader_set_update_time(rgt, iv->riv_rank); - D_DEBUG(DB_REBUILD, "iv rank %d scan_done %d pull_done %d resync dtx %u\n", - iv->riv_rank, iv->riv_scan_done, iv->riv_pull_done, - iv->riv_dtx_resyc_version); + D_DEBUG(DB_REBUILD, DF_RB ": iv rank %d scan_done %d pull_done %d resync dtx %u\n", + DP_RB_RGT(rgt), iv->riv_rank, iv->riv_scan_done, iv->riv_pull_done, + iv->riv_dtx_resyc_version); if (!iv->riv_scan_done) { rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, 0); @@ -450,8 +450,7 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, if (!is_rebuild_global_scan_done(rgt)) { rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, SCAN_DONE); - D_DEBUG(DB_REBUILD, "rebuild ver %d tgt %d scan done\n", - rgt->rgt_rebuild_ver, iv->riv_rank); + D_DEBUG(DB_REBUILD, DF_RB ": rank %d scan done\n", DP_RB_RGT(rgt), iv->riv_rank); /* If global scan is not done, then you can not trust * pull status. But if the rebuild on that target is * failed(riv_status != 0), then the target will report @@ -464,12 +463,13 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, } } - /* Only trust pull done if scan is done globally */ + /* Only trust pull done if scan errored or is done globally */ if (iv->riv_pull_done) { - rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, - PULL_DONE); - D_DEBUG(DB_REBUILD, "rebuild ver %d tgt %d pull done\n", - rgt->rgt_rebuild_ver, iv->riv_rank); + rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, PULL_DONE); + D_DEBUG(DB_REBUILD, DF_RB ": rank %d pull done\n", DP_RB_RGT(rgt), iv->riv_rank); + if (iv->riv_status != 0) + DL_WARN(iv->riv_status, DF_RB ": rank %u update with failure\n", DP_RB_RGT(rgt), + iv->riv_rank); } /* test only: update_and_warn_for_slow_engines(rgt); */ diff --git a/src/tests/ftest/cart/util/daos_sys_logscan.py b/src/tests/ftest/cart/util/daos_sys_logscan.py index 9b5b5e6e78c..be3af7b7066 100755 --- a/src/tests/ftest/cart/util/daos_sys_logscan.py +++ b/src/tests/ftest/cart/util/daos_sys_logscan.py @@ -17,7 +17,6 @@ class SysPools(): """Directory of Pools and Summary Activities Found in Engine Log Files""" # Future possibilities include: - # diagram of nested dictionaries constructed in the comments # system map update events (output outside if pool-specific context) # SWIM events seen by PS leader? # add/remove target events on PS leader? 
@@ -51,11 +50,18 @@ class SysPools(): # re.compile(r"rebuild_task_ult\(\).*map_dist_ver (\d+) map ver (\d+)") # Rebuild: PS leader engine starting and status checking a given operation - # statuses: "scanning", "pulling", "completed", "aborted" + # statuses: "scanning", "pulling", "completed", "aborted", "failed" ldr_start_re = "rebuild_leader_start.*" + rbid_re + "$" - ldr_status_re = r"rebuild_leader_status_check\(\).*" + rbid_re + r" \[(\w+)\].*duration=(\d+)" + ldr_status_re = r"rebuild_leader_status_check\(\).*" + rbid_re + r" \[(\w+)\]" + \ + r".*status (-?\d+)/(\d+) .*duration=(\d+)" + ldr_scan_hung_re = r"update_and_warn_for_slow_engines\(\).*" + rbid_re + \ + r".*scan hung.*waiting for (\d+)/(\d+) engines" + ldr_pull_hung_re = r"update_and_warn_for_slow_engines\(\).*" + rbid_re + \ + r".*pull hung.*waiting for (\d+)/(\d+) engines" re_rebuild_ldr_start = re.compile(ldr_start_re) re_rebuild_ldr_status = re.compile(ldr_status_re) + re_rebuild_ldr_scan_hung = re.compile(ldr_scan_hung_re) + re_rebuild_ldr_pull_hung = re.compile(ldr_pull_hung_re) # Legacy rebuild PS leader logging (before uniform rebuild string) old_ldr_start_re = r"rebuild_leader_start.*([0-9a-fA-F]{8}).*version=(\d+)/(\d+).*op=(\w+)" @@ -80,6 +86,55 @@ def __init__(self): self._check_rb_legacy_fmt = True self._debug = False + # other nested dictionaries within self._pools will be built-up + # pool leadership terms dictionary indexed by integer term number + # self._pools[puuid][term] -> {rank, begin_time, end_time, host, pid, logfile, maps={}} + # + # pool map versions dictionary indexed by integer pool map version + # self._pools[puuid][term]["maps"][mapver] = {carryover, from_ver, time, rb_gens={}} + # + # rebuild generations dictionary indexed by integer rebuild generation number + # contains: rebuild operation, start time, duration, + # status(started/completed/aborted/failed/scanning/pulling), + # and if any scan or pull hang warnings were logged by PS leader + # self._pools[puuid][term]["maps"][mapver]["rb_gens"][gen] = + # {op, start_time, time, started, scanning, scan_hung, scan_hung_time, scan_num_eng_wait, + # pulling, pull_hung, pull_hung_time, pull_num_eng_wait, completed, aborted, failed, dur) + + def _create_mapver(self, ver, carryover=False, tm="xx/xx-xx:xx:xx.xx"): + return { + "carryover": carryover, + "from_ver": ver, + "time": tm, + "rb_gens": {} + } + + def _create_rbgen(self, op, start_time="xx/xx-xx:xx:xx.xx", tm="xx/xx-xx:xx:xx.xx", + started=True, scanning=False, scan_hung=False, + scan_hung_time="xx/xx-xx:xx:xx.xx", scan_num_eng_wait=0, pulling=False, + pull_hung=False, pull_hung_time="xx/xx-xx:xx:xx.xx", pull_num_eng_wait=0, + completed=False, aborted=False, failed=False, fail_rank=-1, rc=0, duration=0): + return { + "op": op, + "start_time": start_time, + "time": tm, + "started": started, + "scanning": scanning, + "scan_hung": scan_hung, + "scan_hung_time": scan_hung_time, + "scan_num_eng_wait": scan_num_eng_wait, + "pulling": pulling, + "pull_hung": pull_hung, + "pull_hung_time": pull_hung_time, + "pull_num_eng_wait": pull_num_eng_wait, + "completed": completed, + "aborted": aborted, + "failed": failed, + "fail_rank": fail_rank, + "rc": rc, + "duration": duration + } + def _warn(self, wmsg, fname, line=None): full_msg = f"WARN file={fname}" if line: @@ -140,13 +195,12 @@ def _match_ps_step_up(self, fname, line, pid, rank): if term in self._pools[puuid]: self._warn(f"pool {puuid} term {term} already seen!", fname, line) # carry over most recent map version into the new term, avoid later 
KeyError - if old_term in self._pools: + pmap_versions = {} + if old_term in self._pools[puuid]: if self._pools and self._pools[puuid][old_term]["maps"] != {}: last_mapver = max(self._pools[puuid][old_term]["maps"].keys()) - pmap_versions = self._pools[puuid][old_term]["maps"][last_mapver] - pmap_versions["carryover"] = True - else: - pmap_versions = {} + pmap_versions[last_mapver] = self._pools[puuid][old_term]["maps"][last_mapver] + pmap_versions[last_mapver]["carryover"] = True self._pools[puuid][term] = { "rank": rank, "begin_time": datetime, @@ -195,12 +249,8 @@ def _match_ps_pmap_update(self, fname, line, pid, rank): if not self._is_leader(puuid, rank, pid): return True term = self._cur_term[puuid] - self._pools[puuid][term]["maps"][to_ver] = { - "carryover": False, - "from_ver": from_ver, - "time": datetime, - "rb_gens": {} - } + self._pools[puuid][term]["maps"][to_ver] = \ + self._create_mapver(from_ver, carryover=False, tm=datetime) if self._debug: print(f"FOUND pool {puuid} map update {from_ver}->{to_ver} rank {rank}\t{host}\t" f"PID {pid}\t{fname}") @@ -233,17 +283,8 @@ def _match_ps_rb_start(self, fname, line, pid, rank): if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: self._warn(f"pool {puuid} term {term} ver {ver} already has gen {gen}", fname, line) # Future possibility: keep timestamps, durations for scan start, pull start, completed - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": datetime, - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = \ + self._create_rbgen(op, start_time=datetime) if self._debug: print(f"{datetime} FOUND rebuild start in term {term}, rb={puuid}/{ver}/{gen}/{op} " f"rank {rank}\t{host}\tPID {pid}\t{fname}") @@ -269,26 +310,19 @@ def _match_legacy_ps_rb_start(self, fname, line, pid, rank): return True if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: self._warn(f"pool {puuid} term {term} ver {ver} already has gen {gen}", fname, line) - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": datetime, - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = \ + self._create_rbgen(op, start_time=datetime) + if self._debug: print(f"{datetime} FOUND rebuild start in term {term}, rb={puuid}/{ver}/{gen}/{op} " f"rank {rank}\t{host}\tPID {pid}\t{fname}") return True def _get_ps_rb_status_components(self, match): - # puuid, map version, rebuild-generation, operation, status, duration + # puuid, map version, rebuild-generation, operation, status, rc, fail_rank, duration # see re_rebuild_ldr_status - return self._get_rb_components(match) + (match.group(5), int(match.group(6))) + return self._get_rb_components(match) + (match.group(5), int(match.group(6)), + int(match.group(7)), int(match.group(8))) def _match_ps_rb_status(self, fname, line, pid, rank): # Do not match on new rebuild log format if we found legacy format @@ -301,7 +335,7 @@ def _match_ps_rb_status(self, fname, line, pid, rank): # Disable checking for legacy rebuild log format, to save execution time self._check_rb_legacy_fmt = False - puuid, ver, gen, op, status, dur = self._get_ps_rb_status_components(match) + puuid, ver, gen, op, status, rc, fail_rank, dur = self._get_ps_rb_status_components(match) 
if not self._is_leader(puuid, rank, pid): return True term = self._cur_term[puuid] @@ -311,23 +345,9 @@ def _match_ps_rb_status(self, fname, line, pid, rank): if ver not in self._pools[puuid][term]["maps"]: self._warn(f"pool {puuid} term {term} ver {ver} not in maps - add placeholder", fname, line) - self._pools[puuid][term]["maps"][ver] = { - "carryover": False, - "from_ver": ver, - "time": "xx/xx-xx:xx:xx.xx", - "rb_gens": {} - } - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": "xx/xx-xx:xx:xx.xx", - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver] = self._create_mapver(ver) + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = self._create_rbgen(op) + if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: existing_op = self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["op"] if op != existing_op: @@ -340,11 +360,18 @@ def _match_ps_rb_status(self, fname, line, pid, rank): self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["completed"] = True elif status == "aborted": self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["aborted"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["fail_rank"] = fail_rank + elif status == "failed": + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["failed"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["fail_rank"] = fail_rank + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["time"] = datetime + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["rc"] = rc self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["duration"] = dur if self._debug: print(f"{datetime} FOUND rebuild UPDATE term={term} rb={puuid}/{ver}/{gen}/{op} " - f"STATUS={status}, DUR={dur} seconds rank {rank}\t{host}\tPID {pid}\t{fname}") + f"STATUS={status}, rc={rc}, fail_rank={fail_rank}, DUR={dur} seconds " + f"rank {rank}\t{host}\tPID {pid}\t{fname}") return True def _get_legacy_ps_rb_status_components(self, match): @@ -378,23 +405,9 @@ def _match_legacy_ps_rb_status(self, fname, line, pid, rank): if ver not in self._pools[puuid][term]["maps"]: self._warn(f"pool {puuid} term {term} ver {ver} not in maps - add placeholder", fname, line) - self._pools[puuid][term]["maps"][ver] = { - "carryover": False, - "from_ver": ver, - "time": "xx/xx-xx:xx:xx.xx", - "rb_gens": {} - } - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": "xx/xx-xx:xx:xx.xx", - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver] = self._create_mapver(ver) + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = self._create_rbgen(op) + if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: existing_op = self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["op"] if op != existing_op: @@ -407,6 +420,8 @@ def _match_legacy_ps_rb_status(self, fname, line, pid, rank): self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["completed"] = True elif status == "aborted": self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["aborted"] = True + elif status == "failed": + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["failed"] = True self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["time"] = datetime self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["duration"] = dur if self._debug: @@ -414,6 +429,40 
@@ def _match_legacy_ps_rb_status(self, fname, line, pid, rank): f"STATUS={status}, DUR={dur} seconds rank {rank}\t{host}\tPID {pid}\t{fname}") return True + def _get_ps_rb_hung_warn_components(self, match): + # puuid, map version, rebuild-generation, operation, status, duration + # see re_rebuild_ldr_scan_hung and re_rebuild_ldr_pull_hung + return self._get_rb_components(match) + (int(match.group(5)), int(match.group(6))) + + def _match_ps_rb_hung_warn(self, fname, line, pid, rank): + msg, host, datetime = self._get_line_components(line) + match1 = self.re_rebuild_ldr_scan_hung.match(msg) + match2 = self.re_rebuild_ldr_pull_hung.match(msg) + if not match1 and not match2: + return False + + if match1: + puuid, ver, gen, op, compl_eng, tot_eng = self._get_ps_rb_hung_warn_components(match1) + term = self._cur_term[puuid] + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["scan_hung"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["scan_num_eng_wait"] = compl_eng + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["scan_hung_time"] = datetime + if self._debug: + print(f"{datetime} FOUND rebuild SCAN hung term={term} rb={puuid}/{ver}/{gen}/{op} " + f"{compl_eng} / {tot_eng} done, rank {rank}\t{host}\tPID {pid}\t{fname}") + + if match2: + puuid, ver, gen, op, compl_eng, tot_eng = self._get_ps_rb_hung_warn_components(match2) + term = self._cur_term[puuid] + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["pull_hung"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["pull_num_eng_wait"] = compl_eng + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["pull_hung_time"] = datetime + if self._debug: + print(f"{datetime} FOUND rebuild PULL hung term={term} rb={puuid}/{ver}/{gen}/{op} " + f"{compl_eng} / {tot_eng} done, rank {rank}\t{host}\tPID {pid}\t{fname}") + + return True + def scan_file(self, log_iter, rank=-1): """Scan a daos engine log file and insert important pool events into a nested dictionary""" fname = log_iter.fname @@ -447,12 +496,17 @@ def scan_file(self, log_iter, rank=-1): if self._match_legacy_ps_rb_start(fname, line, pid, rank): continue + # Find rebuild status updates if self._match_ps_rb_status(fname, line, pid, rank): continue if self._match_legacy_ps_rb_status(fname, line, pid, rank): continue + # Find rebuild scan or pull phase hung warnings + if self._match_ps_rb_hung_warn(fname, line, pid, rank): + continue + # Future: for a PID that is killed, clear any associated cur_ldr_rank / cur_ldr_pid. # At logfile end, it could be due to engine killed, or could just be log rotation. 
@@ -489,9 +543,16 @@ def print_pools(self): op = rd["op"] dur = rd["duration"] scan = rd["scanning"] + scan_hung = rd["scan_hung"] + scan_num_eng_wait = rd["scan_num_eng_wait"] pull = rd["pulling"] + pull_hung = rd["pull_hung"] + pull_num_eng_wait = rd["pull_num_eng_wait"] comp = rd["completed"] abrt = rd["aborted"] + fail = rd["failed"] + fail_rank = rd["fail_rank"] + rc = rd["rc"] st = rd["start_time"] ut = rd["time"] @@ -501,14 +562,35 @@ def print_pools(self): status = "aborted" elif comp: status = "completed" + elif fail: + status = "failed" elif pull: status = "pulling" elif scan: status = "scanning" + + # hung_status + hung_status = "" + if scan_hung: + hung_status += "scan-hung" + scan_hung_time = rd["scan_hung_time"] + if pull_hung: + if hung_status != "": + hung_status += "," + hung_status += "pull-hung" + pull_hung_time = rd["pull_hung_time"] + + # Print rebuild start, any hang warnings, and latest status updates print(f"{st} {puuid} RBSTRT {v}/{g}/{op}") - updated = scan or pull or comp or abrt - if updated: - print(f"{ut} {puuid} RBUPDT {v}/{g}/{op} {status} {dur} seconds") + if scan_hung: + print(f"{scan_hung_time} {puuid} RBHUNG {v}/{g}/{op} {hung_status}: " + f"{scan_num_eng_wait} engines not done scanning") + if pull_hung: + print(f"{pull_hung_time} {puuid} RBHUNG {v}/{g}/{op} {hung_status}: " + f"{pull_num_eng_wait} engines not done pulling") + if scan or pull or comp or abrt or fail: + print(f"{ut} {puuid} RBUPDT {v}/{g}/{op} {status} rc={rc} " + f"fail_rank={fail_rank} {dur} seconds") # Print term end (if there is a PS leader step_down) if e != "": From bee921514aafb05b9a500c1fcb543bd320d8bb24 Mon Sep 17 00:00:00 2001 From: Kenneth Cain Date: Mon, 27 Jan 2025 22:06:40 -0500 Subject: [PATCH 3/3] fix up some linting issues Features: rebuild Allow-unstable-test: true Signed-off-by: Kenneth Cain --- src/rebuild/srv.c | 8 ++-- src/tests/ftest/cart/util/daos_sys_logscan.py | 42 +++++++++---------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 03c2fa22b75..ea7fa4e7ac0 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -439,8 +439,8 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, rebuild_leader_set_update_time(rgt, iv->riv_rank); D_DEBUG(DB_REBUILD, DF_RB ": iv rank %d scan_done %d pull_done %d resync dtx %u\n", - DP_RB_RGT(rgt), iv->riv_rank, iv->riv_scan_done, iv->riv_pull_done, - iv->riv_dtx_resyc_version); + DP_RB_RGT(rgt), iv->riv_rank, iv->riv_scan_done, iv->riv_pull_done, + iv->riv_dtx_resyc_version); if (!iv->riv_scan_done) { rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, 0); @@ -468,8 +468,8 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, PULL_DONE); D_DEBUG(DB_REBUILD, DF_RB ": rank %d pull done\n", DP_RB_RGT(rgt), iv->riv_rank); if (iv->riv_status != 0) - DL_WARN(iv->riv_status, DF_RB ": rank %u update with failure\n", DP_RB_RGT(rgt), - iv->riv_rank); + DL_WARN(iv->riv_status, DF_RB ": rank %u update with failure", + DP_RB_RGT(rgt), iv->riv_rank); } /* test only: update_and_warn_for_slow_engines(rgt); */ diff --git a/src/tests/ftest/cart/util/daos_sys_logscan.py b/src/tests/ftest/cart/util/daos_sys_logscan.py index be3af7b7066..7facffbe7c7 100755 --- a/src/tests/ftest/cart/util/daos_sys_logscan.py +++ b/src/tests/ftest/cart/util/daos_sys_logscan.py @@ -109,30 +109,26 @@ def _create_mapver(self, ver, carryover=False, tm="xx/xx-xx:xx:xx.xx"): "rb_gens": {} 
} - def _create_rbgen(self, op, start_time="xx/xx-xx:xx:xx.xx", tm="xx/xx-xx:xx:xx.xx", - started=True, scanning=False, scan_hung=False, - scan_hung_time="xx/xx-xx:xx:xx.xx", scan_num_eng_wait=0, pulling=False, - pull_hung=False, pull_hung_time="xx/xx-xx:xx:xx.xx", pull_num_eng_wait=0, - completed=False, aborted=False, failed=False, fail_rank=-1, rc=0, duration=0): + def _create_rbgen(self, op, start_time="xx/xx-xx:xx:xx.xx"): return { "op": op, "start_time": start_time, - "time": tm, - "started": started, - "scanning": scanning, - "scan_hung": scan_hung, - "scan_hung_time": scan_hung_time, - "scan_num_eng_wait": scan_num_eng_wait, - "pulling": pulling, - "pull_hung": pull_hung, - "pull_hung_time": pull_hung_time, - "pull_num_eng_wait": pull_num_eng_wait, - "completed": completed, - "aborted": aborted, - "failed": failed, - "fail_rank": fail_rank, - "rc": rc, - "duration": duration + "time": "xx/xx-xx:xx:xx.xx", + "started": True, + "scanning": False, + "scan_hung": False, + "scan_hung_time": "xx/xx-xx:xx:xx.xx", + "scan_num_eng_wait": 0, + "pulling": False, + "pull_hung": False, + "pull_hung_time": "xx/xx-xx:xx:xx.xx", + "pull_num_eng_wait": 0, + "completed": False, + "aborted": False, + "failed": False, + "fail_rank": -1, + "rc": 0, + "duration": 0 } def _warn(self, wmsg, fname, line=None): @@ -212,7 +208,7 @@ def _match_ps_step_up(self, fname, line, pid, rank): } if self._debug: print(f"{datetime} FOUND pool {puuid} BEGIN\tterm {term} pmap_versions empty: " - f"{str(pmap_versions == {})} rank {rank}\t{host}\tPID {pid}\t{fname}") + f"{str(not pmap_versions)} rank {rank}\t{host}\tPID {pid}\t{fname}") return True def _match_ps_step_down(self, fname, line, pid, rank): @@ -511,6 +507,8 @@ def scan_file(self, log_iter, rank=-1): # At logfile end, it could be due to engine killed, or could just be log rotation. def print_pools(self): + # pylint: disable=too-many-locals + # pylint: disable=too-many-nested-blocks """Print all pools important events found in a nested dictionary""" # pd (pool dictionary): pool UUID -> td
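
For readers who want to see the "almost done" heuristic from PATCH 1 in isolation, the following standalone sketch mirrors the thresholding arithmetic of is_rebuild_phase_mostly_done() and the way a 120-second warning deadline is armed once a rebuild phase crosses that threshold. It is a simplified illustration, not the DAOS implementation: the phase_deadline_check() helper, the hard-coded engine counts in main(), and the use of plain time() instead of ABT_get_wtime() are all assumptions made only for this example, and the per-2-minute warning throttle from update_and_warn_for_slow_engines() is omitted.

/*
 * Standalone sketch (not part of the patch) of the "mostly done" heuristic
 * and the 2-minute warning deadline used by the PS leader's slow-engine check.
 * Build with: cc -std=c99 -o mostly_done mostly_done.c
 */
#include <stdio.h>
#include <stdbool.h>
#include <time.h>

#define MIN_WAIT_CT 1   /* always tolerate waiting for at least one engine  */
#define MAX_WAIT_CT 20  /* never "mostly done" with more than 20 stragglers */

/* Mirrors is_rebuild_phase_mostly_done(): the phase is "mostly done" when the
 * number of engines still being waited for falls under a threshold derived
 * from how far past 95% completion the phase already is, clamped to 1..20.
 */
static bool
is_rebuild_phase_mostly_done(int engines_done_ct, int engines_total_ct)
{
    int wait_ct_threshold;
    int engines_waiting_ct = engines_total_ct - engines_done_ct;

    wait_ct_threshold = engines_done_ct - .95 * engines_total_ct;
    if (wait_ct_threshold < MIN_WAIT_CT)
        wait_ct_threshold = MIN_WAIT_CT;
    if (wait_ct_threshold > MAX_WAIT_CT)
        wait_ct_threshold = MAX_WAIT_CT;

    return engines_waiting_ct <= wait_ct_threshold;
}

/* Hypothetical helper: arm a warning deadline 120 seconds after the phase
 * first looks "mostly done"; if the deadline passes and the phase still has
 * not finished, the caller would log a "scan hung?" / "pull hung?" warning.
 */
static void
phase_deadline_check(double *deadline_ts, double now, int done, int total)
{
    if (done == total)
        return;                     /* phase finished, nothing to warn about */
    if (!is_rebuild_phase_mostly_done(done, total))
        return;                     /* not close enough to done yet */
    if (*deadline_ts == 0.0) {
        *deadline_ts = now + 120.0; /* arm the 2-minute deadline once */
        printf("phase almost done (%d/%d engines), deadline armed\n", done, total);
    } else if (now > *deadline_ts) {
        printf("phase hung? still waiting for %d/%d engines\n", total - done, total);
    }
}

int
main(void)
{
    /* Illustrative engine counts (hypothetical cluster sizes). */
    const int cases[][2] = { {8, 7}, {100, 99}, {1000, 985}, {1000, 950} };
    double deadline = 0.0;
    double now = (double)time(NULL);

    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
        printf("total=%4d done=%4d -> mostly done: %s\n", cases[i][0], cases[i][1],
               is_rebuild_phase_mostly_done(cases[i][1], cases[i][0]) ? "yes" : "no");

    /* First call arms the deadline; a later call past the deadline warns. */
    phase_deadline_check(&deadline, now, 99, 100);
    phase_deadline_check(&deadline, now + 130.0, 99, 100);
    return 0;
}

The clamp to MIN_WAIT_CT/MAX_WAIT_CT is what keeps the straggler list in the 1-20 engine range called out in the PATCH 1 commit message regardless of system size, and the one-shot arming of the deadline is why the test's scan hang of more than two minutes is enough to trigger the new "scan hung?" warning.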