diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 28a8d6b747f..dda95847ecb 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -103,6 +104,7 @@ struct rebuild_tgt_pool_tracker { struct rebuild_server_status { d_rank_t rank; + double last_update; uint32_t dtx_resync_version; uint32_t scan_done:1, pull_done:1; @@ -122,8 +124,11 @@ struct rebuild_global_pool_tracker { /** rebuild status for each server */ struct rebuild_server_status *rgt_servers; - double *rgt_servers_last_update; - double rgt_last_warn; + + /** timestamps for the global operation */ + double rgt_last_warn_ts; /* time of most recent warning log for "slow engines" */ + double rgt_scan_warn_deadline_ts; /* time after which to warn that scan may be hung */ + double rgt_pull_warn_deadline_ts; /* time after which to warn that pull may be hung */ /** indirect indices for binary search by rank */ struct rebuild_server_status **rgt_servers_sorted; diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index c18486e35ec..ea7fa4e7ac0 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -160,6 +160,23 @@ is_rebuild_global_done(struct rebuild_global_pool_tracker *rgt) } +/* determine if "most" engines are done with their current rebuild phase (scan or pull) */ +static bool +is_rebuild_phase_mostly_done(int engines_done_ct, int engines_total_ct) +{ + const int MIN_WAIT_CT = 1; + const int MAX_WAIT_CT = 20; + int wait_ct_threshold; + int engines_waiting_ct = engines_total_ct - engines_done_ct; + + /* When is the global operation "mostly done" (e.g., waiting for 20 or fewer engines)? */ + wait_ct_threshold = engines_done_ct - .95 * engines_total_ct; + wait_ct_threshold = max(MIN_WAIT_CT, wait_ct_threshold); + wait_ct_threshold = min(MAX_WAIT_CT, wait_ct_threshold); + + return (engines_waiting_ct <= wait_ct_threshold); +} + #define SCAN_DONE 0x1 #define PULL_DONE 0x2 @@ -246,7 +263,7 @@ rebuild_leader_set_update_time(struct rebuild_global_pool_tracker *rgt, d_rank_t i = daos_array_find(rgt->rgt_servers_sorted, rgt->rgt_servers_number, rank, &servers_sort_ops); if (i >= 0) { - rgt->rgt_servers_last_update[i] = ABT_get_wtime(); + rgt->rgt_servers[i].last_update = ABT_get_wtime(); return; } D_INFO("rank %u is not included in this rebuild.\n", rank); @@ -320,14 +337,109 @@ rpt_lookup(uuid_t pool_uuid, uint32_t opc, unsigned int ver, unsigned int gen) return found; } +static void +update_and_warn_for_slow_engines(struct rebuild_global_pool_tracker *rgt) +{ + int i; + int scan_ct = 0; + int pull_ct = 0; + int done_ct; + int wait_ct; + bool scan_gl = false; + bool pull_gl = false; + bool do_warn = false; + double now = ABT_get_wtime(); + double tw = now - rgt->rgt_last_warn_ts; /* time since last warning logged */ + bool warned = false; + + /* Throttle warnings to not more often than once per 2 minutes */ + do_warn = (tw >= 120); + + /* Count scan/pull progress and warn for any ranks that haven't provided updates recently */ + for (i = 0; i < rgt->rgt_servers_number; i++) { + double tu = now - rgt->rgt_servers[i].last_update; + d_rank_t r = rgt->rgt_servers[i].rank; + + if (rgt->rgt_servers[i].scan_done) { + scan_ct++; + if (rgt->rgt_servers[i].pull_done) { + pull_ct++; + continue; + } + } + + if (!do_warn) + continue; + + if (tu > 30) { + D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. " + "scan_done=%d pull_done=%d\n", + DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done, + rgt->rgt_servers[i].pull_done); + warned = true; + } + } + + scan_gl = (scan_ct == rgt->rgt_servers_number); + pull_gl = (pull_ct == rgt->rgt_servers_number); + if (scan_gl && pull_gl) + return; + + /* Determine if scan/pull progress is almost done; mark the time and warn on possible stall. + */ + done_ct = scan_gl ? pull_ct : scan_ct; + wait_ct = rgt->rgt_servers_number - done_ct; +#if 0 + D_DEBUG(DB_TRACE, DF_RB ": s_done=%s, s_ct=%d, p_done=%s, p_ct=%d, servers=%d, d_ct=%d, " + " almost=%s\n", DP_RB_RGT(rgt), scan_gl ? "yes" : "no", scan_ct, pull_gl ? "yes" : "no", + pull_ct, rgt->rgt_servers_number, done_ct, + is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number) ? "yes" : "no"); +#endif + if (is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number)) { + if (!scan_gl && rgt->rgt_scan_warn_deadline_ts == 0.0) { + rgt->rgt_scan_warn_deadline_ts = now + 120.0; + D_DEBUG(DB_REBUILD, DF_RB ": scan almost done, %d/%d engines\n", + DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number); + } else if (!pull_gl && rgt->rgt_pull_warn_deadline_ts == 0.0) { + rgt->rgt_pull_warn_deadline_ts = now + 120.0; + D_DEBUG(DB_REBUILD, DF_RB ": pull almost done, %d/%d engines\n", + DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number); + } + + if (!do_warn) + return; + + if (!scan_gl && now > rgt->rgt_scan_warn_deadline_ts) { + D_WARN(DF_RB ": scan hung? waiting for %d/%d engines:\n", DP_RB_RGT(rgt), + wait_ct, rgt->rgt_servers_number); + for (i = 0; i < rgt->rgt_servers_number; i++) + if (!rgt->rgt_servers[i].scan_done) + D_WARN(DF_RB ": rank %u not finished scanning!\n", + DP_RB_RGT(rgt), rgt->rgt_servers[i].rank); + warned = true; + } else if (!pull_gl && now > rgt->rgt_pull_warn_deadline_ts) { + D_WARN(DF_RB ": pull hung? waiting for %d/%d engines:\n", DP_RB_RGT(rgt), + wait_ct, rgt->rgt_servers_number); + for (i = 0; i < rgt->rgt_servers_number; i++) + if (!rgt->rgt_servers[i].pull_done) + D_WARN(DF_RB ": rank %u not finished pulling!\n", + DP_RB_RGT(rgt), rgt->rgt_servers[i].rank); + warned = true; + } + + if (warned) + rgt->rgt_last_warn_ts = now; + } +} + int rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, struct rebuild_iv *iv) { rebuild_leader_set_update_time(rgt, iv->riv_rank); - D_DEBUG(DB_REBUILD, "iv rank %d scan_done %d pull_done %d resync dtx %u\n", - iv->riv_rank, iv->riv_scan_done, iv->riv_pull_done, + D_DEBUG(DB_REBUILD, DF_RB ": iv rank %d scan_done %d pull_done %d resync dtx %u\n", + DP_RB_RGT(rgt), iv->riv_rank, iv->riv_scan_done, iv->riv_pull_done, iv->riv_dtx_resyc_version); if (!iv->riv_scan_done) { @@ -338,26 +450,30 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, if (!is_rebuild_global_scan_done(rgt)) { rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, SCAN_DONE); - D_DEBUG(DB_REBUILD, "rebuild ver %d tgt %d scan done\n", - rgt->rgt_rebuild_ver, iv->riv_rank); + D_DEBUG(DB_REBUILD, DF_RB ": rank %d scan done\n", DP_RB_RGT(rgt), iv->riv_rank); /* If global scan is not done, then you can not trust * pull status. But if the rebuild on that target is * failed(riv_status != 0), then the target will report * both scan and pull status to the leader, i.e. they * both can be trusted. */ - if (iv->riv_status == 0) + if (iv->riv_status == 0) { + /* test only: update_and_warn_for_slow_engines(rgt); */ return 0; + } } - /* Only trust pull done if scan is done globally */ + /* Only trust pull done if scan errored or is done globally */ if (iv->riv_pull_done) { - rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, - PULL_DONE); - D_DEBUG(DB_REBUILD, "rebuild ver %d tgt %d pull done\n", - rgt->rgt_rebuild_ver, iv->riv_rank); + rebuild_leader_set_status(rgt, iv->riv_rank, iv->riv_dtx_resyc_version, PULL_DONE); + D_DEBUG(DB_REBUILD, DF_RB ": rank %d pull done\n", DP_RB_RGT(rgt), iv->riv_rank); + if (iv->riv_status != 0) + DL_WARN(iv->riv_status, DF_RB ": rank %u update with failure", + DP_RB_RGT(rgt), iv->riv_rank); } + /* test only: update_and_warn_for_slow_engines(rgt); */ + return 0; } @@ -703,39 +819,6 @@ enum { RB_BCAST_QUERY, }; -static void -warn_for_slow_engine_updates(struct rebuild_global_pool_tracker *rgt) -{ - int i; - double now = ABT_get_wtime(); - double tw = now - rgt->rgt_last_warn; /* time since last warning logged */ - bool warned = false; - - /* Throttle warnings to not more often than once per 2 minutes */ - if (tw < 120) - return; - - /* Warn for ranks not done and that haven't provided updates in a while (> 30 sec) */ - for (i = 0; i < rgt->rgt_servers_number; i++) { - double tu = now - rgt->rgt_servers_last_update[i]; - d_rank_t r = rgt->rgt_servers[i].rank; - - if (rgt->rgt_servers[i].scan_done && rgt->rgt_servers[i].pull_done) - continue; - - if (tu > 30) { - D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. " - "scan_done=%d pull_done=%d\n", - DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done, - rgt->rgt_servers[i].pull_done); - warned = true; - } - } - - if (warned) - rgt->rgt_last_warn = now; -} - /* * Check rebuild status on the leader. Every other target sends * its own rebuild status by IV. @@ -865,7 +948,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, D_PRINT("%s", sbuf); } sleep: - warn_for_slow_engine_updates(rgt); + update_and_warn_for_slow_engines(rgt); sched_req_sleep(rgt->rgt_ult, RBLD_CHECK_INTV); } @@ -882,8 +965,6 @@ rebuild_global_pool_tracker_destroy(struct rebuild_global_pool_tracker *rgt) D_FREE(rgt->rgt_servers); if (rgt->rgt_servers_sorted) D_FREE(rgt->rgt_servers_sorted); - if (rgt->rgt_servers_last_update) - D_FREE(rgt->rgt_servers_last_update); if (rgt->rgt_lock) ABT_mutex_free(&rgt->rgt_lock); @@ -921,16 +1002,13 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t D_ALLOC_ARRAY(rgt->rgt_servers_sorted, rank_nr); if (rgt->rgt_servers_sorted == NULL) D_GOTO(out, rc = -DER_NOMEM); - D_ALLOC_ARRAY(rgt->rgt_servers_last_update, rank_nr); - if (rgt->rgt_servers_last_update == NULL) - D_GOTO(out, rc = -DER_NOMEM); now = ABT_get_wtime(); - rgt->rgt_last_warn = now; + rgt->rgt_last_warn_ts = now; for (i = 0; i < rank_nr; i++) { rgt->rgt_servers_sorted[i] = &rgt->rgt_servers[i]; rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank; - rgt->rgt_servers_last_update[i] = now; + rgt->rgt_servers[i].last_update = now; } rgt->rgt_servers_number = rank_nr; diff --git a/src/tests/ftest/cart/util/daos_sys_logscan.py b/src/tests/ftest/cart/util/daos_sys_logscan.py index 9b5b5e6e78c..35ac2b729cb 100755 --- a/src/tests/ftest/cart/util/daos_sys_logscan.py +++ b/src/tests/ftest/cart/util/daos_sys_logscan.py @@ -17,7 +17,6 @@ class SysPools(): """Directory of Pools and Summary Activities Found in Engine Log Files""" # Future possibilities include: - # diagram of nested dictionaries constructed in the comments # system map update events (output outside if pool-specific context) # SWIM events seen by PS leader? # add/remove target events on PS leader? @@ -43,7 +42,9 @@ class SysPools(): re_pmap_update = re.compile(upd_re) # uniform rebuild string identifier rb=/// - rbid_re = r"rb=([0-9a-fA-F]{8})/(\d+)/(\d+)/(\w+)" + # need special "_rf" versions of regex's to match "Reclaim fail" op (as opposed to "Reclaim") + rbid_re = r"rb=([0-9a-fA-F]{8})/(\d+)/(\d+)/(Rebuild|Reclaim)" + rbid_rf_re = r"rb=([0-9a-fA-F]{8})/(\d+)/(\d+)/(Reclaim fail)" # Future possibility: match the rebuild preliminary steps # rebuild_task_ult() wait for scheduling, and map dist - both would info to match on. @@ -51,18 +52,47 @@ class SysPools(): # re.compile(r"rebuild_task_ult\(\).*map_dist_ver (\d+) map ver (\d+)") # Rebuild: PS leader engine starting and status checking a given operation - # statuses: "scanning", "pulling", "completed", "aborted" + # statuses: "scanning", "pulling", "completed", "aborted", "failed" ldr_start_re = "rebuild_leader_start.*" + rbid_re + "$" - ldr_status_re = r"rebuild_leader_status_check\(\).*" + rbid_re + r" \[(\w+)\].*duration=(\d+)" + ldr_start_rf_re = "rebuild_leader_start.*" + rbid_rf_re + "$" + + ldr_status_re = r"rebuild_leader_status_check\(\).*" + rbid_re + r" \[(\w+)\]" + \ + r".*status (-?\d+)/(\d+) .*duration=(\d+)" + ldr_status_rf_re = r"rebuild_leader_status_check\(\).*" + rbid_rf_re + r" \[(\w+)\]" + \ + r".*status (-?\d+)/(\d+) .*duration=(\d+)" + ldr_scan_hung_re = r"update_and_warn_for_slow_engines\(\).*" + rbid_re + \ + r".*scan hung.*waiting for (\d+)/(\d+) engines" + ldr_scan_hung_rf_re = r"update_and_warn_for_slow_engines\(\).*" + rbid_rf_re + \ + r".*scan hung.*waiting for (\d+)/(\d+) engines" + ldr_pull_hung_re = r"update_and_warn_for_slow_engines\(\).*" + rbid_re + \ + r".*pull hung.*waiting for (\d+)/(\d+) engines" + ldr_pull_hung_rf_re = r"update_and_warn_for_slow_engines\(\).*" + rbid_rf_re + \ + r".*pull hung.*waiting for (\d+)/(\d+) engines" re_rebuild_ldr_start = re.compile(ldr_start_re) + re_rebuild_ldr_start_rf = re.compile(ldr_start_rf_re) + re_rebuild_ldr_status = re.compile(ldr_status_re) + re_rebuild_ldr_status_rf = re.compile(ldr_status_rf_re) + re_rebuild_ldr_scan_hung = re.compile(ldr_scan_hung_re) + re_rebuild_ldr_scan_hung_rf = re.compile(ldr_scan_hung_rf_re) + re_rebuild_ldr_pull_hung = re.compile(ldr_pull_hung_re) + re_rebuild_ldr_pull_hung_rf = re.compile(ldr_pull_hung_rf_re) # Legacy rebuild PS leader logging (before uniform rebuild string) - old_ldr_start_re = r"rebuild_leader_start.*([0-9a-fA-F]{8}).*version=(\d+)/(\d+).*op=(\w+)" - old_ldr_status_re = (r"rebuild_leader_status_check\(\) (\w+) \[(\w+)\] \(pool ([0-9a-fA-F]{8}) " - r"leader (\d+) term (\d+).*ver=(\d+),gen (\d+).*duration=(\d+) secs") + old_ldr_start_re = \ + r"rebuild_leader_start.*([0-9a-fA-F]{8}).*version=(\d+)/(\d+).*op=(Rebuild|Reclaim)" + old_ldr_start_rf_re = \ + r"rebuild_leader_start.*([0-9a-fA-F]{8}).*version=(\d+)/(\d+).*op=(Reclaim fail)" + old_ldr_status_re = \ + (r"rebuild_leader_status_check\(\) (Rebuild|Reclaim) \[(\w+)\] \(pool ([0-9a-fA-F]{8}) " + r"leader (\d+) term (\d+).*ver=(\d+),gen (\d+).*duration=(\d+) secs") + old_ldr_status_rf_re = \ + (r"rebuild_leader_status_check\(\) (Reclaim fail) \[(\w+)\] \(pool ([0-9a-fA-F]{8}) " + r"leader (\d+) term (\d+).*ver=(\d+),gen (\d+).*duration=(\d+) secs") re_old_ldr_start = re.compile(old_ldr_start_re) + re_old_ldr_start_rf = re.compile(old_ldr_start_rf_re) re_old_ldr_status = re.compile(old_ldr_status_re) + re_old_ldr_status_rf = re.compile(old_ldr_status_rf_re) def __init__(self): # dictionaries indexed by pool UUID @@ -80,6 +110,72 @@ def __init__(self): self._check_rb_legacy_fmt = True self._debug = False + # other nested dictionaries within self._pools will be built-up + # pool leadership terms dictionary indexed by integer term number + # self._pools[puuid][term] -> {rank, begin_time, end_time, host, pid, logfile, maps={}} + # + # pool map versions dictionary indexed by integer pool map version + # self._pools[puuid][term]["maps"][mapver] = {carryover, from_ver, time, rb_gens={}} + # + # rebuild generations dictionary indexed by integer rebuild generation number + # contains: rebuild operation, start time, duration, + # status(started/completed/aborted/failed/scanning/pulling), + # and if any scan or pull hang warnings were logged by PS leader + # self._pools[puuid][term]["maps"][mapver]["rb_gens"][gen] = + # {op, start_time, time, started, scanning, scan_hung, scan_hung_time, scan_num_eng_wait, + # pulling, pull_hung, pull_hung_time, pull_num_eng_wait, completed, aborted, failed, dur) + + def _create_term(self, puuid, term, rank, host, pid, logfile, begin_t="xx/xx-xx:xx:xx.xx", + end_t="xx/xx-xx:xx:xx.xx"): + # carry over most recent map version into the new term, avoid later KeyError + old_term = term - 1 + pmap_versions = {} + if old_term in self._pools[puuid]: + if self._pools and self._pools[puuid][old_term]["maps"] != {}: + last_mapver = max(self._pools[puuid][old_term]["maps"].keys()) + pmap_versions[last_mapver] = self._pools[puuid][old_term]["maps"][last_mapver] + pmap_versions[last_mapver]["carryover"] = True + + return { + "rank": rank, + "begin_time": begin_t, + "end_time": end_t, + "host": host, + "pid": pid, + "logfile": logfile, + "maps": pmap_versions + } + + def _create_mapver(self, ver, carryover=False, tm="xx/xx-xx:xx:xx.xx"): + return { + "carryover": carryover, + "from_ver": ver, + "time": tm, + "rb_gens": {} + } + + def _create_rbgen(self, op, start_time="xx/xx-xx:xx:xx.xx"): + return { + "op": op, + "start_time": start_time, + "time": "xx/xx-xx:xx:xx.xx", + "started": True, + "scanning": False, + "scan_hung": False, + "scan_hung_time": "xx/xx-xx:xx:xx.xx", + "scan_num_eng_wait": 0, + "pulling": False, + "pull_hung": False, + "pull_hung_time": "xx/xx-xx:xx:xx.xx", + "pull_num_eng_wait": 0, + "completed": False, + "aborted": False, + "failed": False, + "fail_rank": -1, + "rc": 0, + "duration": 0 + } + def _warn(self, wmsg, fname, line=None): full_msg = f"WARN file={fname}" if line: @@ -135,30 +231,14 @@ def _match_ps_step_up(self, fname, line, pid, rank): self._cur_ldr_rank[puuid] = rank self._cur_ldr_pid[puuid] = pid self._cur_term[puuid] = term - old_term = term - 1 # if term already exists, error? if term in self._pools[puuid]: self._warn(f"pool {puuid} term {term} already seen!", fname, line) - # carry over most recent map version into the new term, avoid later KeyError - if old_term in self._pools: - if self._pools and self._pools[puuid][old_term]["maps"] != {}: - last_mapver = max(self._pools[puuid][old_term]["maps"].keys()) - pmap_versions = self._pools[puuid][old_term]["maps"][last_mapver] - pmap_versions["carryover"] = True - else: - pmap_versions = {} - self._pools[puuid][term] = { - "rank": rank, - "begin_time": datetime, - "end_time": "", - "host": host, - "pid": pid, - "logfile": fname, - "maps": pmap_versions - } + self._pools[puuid][term] = self._create_term(puuid, term, rank, host, pid, fname, + begin_t=datetime) if self._debug: - print(f"{datetime} FOUND pool {puuid} BEGIN\tterm {term} pmap_versions empty: " - f"{str(pmap_versions == {})} rank {rank}\t{host}\tPID {pid}\t{fname}") + print(f"{datetime} FOUND pool {puuid} BEGIN\tterm {term} rank {rank}\t{host}" + f"\tPID {pid}\t{fname}") return True def _match_ps_step_down(self, fname, line, pid, rank): @@ -168,12 +248,22 @@ def _match_ps_step_down(self, fname, line, pid, rank): return False puuid, term = self._get_ps_leader_components(match) + if puuid not in self._pools: + self._pools[puuid] = {} + self._cur_term[puuid] = -1 + if term != self._cur_term[puuid]: - self._warn(f"step_down term={term} != cur_term={self._cur_term}", fname, line) + self._warn(f"pool {puuid} step_down term={term} != cur_term={self._cur_term[puuid]}", + fname, line) + if term not in self._pools[puuid]: + self._pools[puuid][term] = self._create_term(puuid, term, rank, host, pid, fname, + end_t=datetime) + else: + self._pools[puuid][term]["end_time"] = datetime + self._cur_ldr_rank[puuid] = -1 self._cur_ldr_pid[puuid] = -1 self._cur_term[puuid] = -1 - self._pools[puuid][term]["end_time"] = datetime if self._debug: print(f"{datetime} FOUND pool {puuid} END\tterm {term} rank {rank}\t{host}\t" f"PID {pid}\t{fname}") @@ -195,12 +285,8 @@ def _match_ps_pmap_update(self, fname, line, pid, rank): if not self._is_leader(puuid, rank, pid): return True term = self._cur_term[puuid] - self._pools[puuid][term]["maps"][to_ver] = { - "carryover": False, - "from_ver": from_ver, - "time": datetime, - "rb_gens": {} - } + self._pools[puuid][term]["maps"][to_ver] = \ + self._create_mapver(from_ver, carryover=False, tm=datetime) if self._debug: print(f"FOUND pool {puuid} map update {from_ver}->{to_ver} rank {rank}\t{host}\t" f"PID {pid}\t{fname}") @@ -217,9 +303,11 @@ def _match_ps_rb_start(self, fname, line, pid, rank): if not self._check_rb_new_fmt: return False msg, host, datetime = self._get_line_components(line) - match = self.re_rebuild_ldr_start.match(msg) + match = self.re_rebuild_ldr_start_rf.match(msg) if not match: - return False + match = self.re_rebuild_ldr_start.match(msg) + if not match: + return False # Disable checking for legacy rebuild log format, to save execution time self._check_rb_legacy_fmt = False @@ -230,20 +318,17 @@ def _match_ps_rb_start(self, fname, line, pid, rank): if term < 1: self._warn(f"pool {puuid} I don't know what term it is ({term})!", fname, line) return True + + if ver not in self._pools[puuid][term]["maps"]: + self._warn(f"pool {puuid} term {term} ver {ver} not in maps - add placeholder", + fname, line) + self._pools[puuid][term]["maps"][ver] = self._create_mapver(ver) + if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: self._warn(f"pool {puuid} term {term} ver {ver} already has gen {gen}", fname, line) # Future possibility: keep timestamps, durations for scan start, pull start, completed - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": datetime, - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = \ + self._create_rbgen(op, start_time=datetime) if self._debug: print(f"{datetime} FOUND rebuild start in term {term}, rb={puuid}/{ver}/{gen}/{op} " f"rank {rank}\t{host}\tPID {pid}\t{fname}") @@ -254,9 +339,11 @@ def _match_legacy_ps_rb_start(self, fname, line, pid, rank): if not self._check_rb_legacy_fmt: return False msg, host, datetime = self._get_line_components(line) - match = self.re_old_ldr_start.match(msg) + match = self.re_old_ldr_start_rf.match(msg) if not match: - return False + match = match = self.re_old_ldr_start.match(msg) + if not match: + return False # Disable checking for new rebuild log format, to save execution time self._check_rb_new_fmt = False @@ -267,41 +354,42 @@ def _match_legacy_ps_rb_start(self, fname, line, pid, rank): if term < 1: self._warn(f"pool {puuid} I don't know what term it is ({term})!", fname, line) return True + + if ver not in self._pools[puuid][term]["maps"]: + self._warn(f"pool {puuid} term {term} ver {ver} not in maps - add placeholder", + fname, line) + self._pools[puuid][term]["maps"][ver] = self._create_mapver(ver) + if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: self._warn(f"pool {puuid} term {term} ver {ver} already has gen {gen}", fname, line) - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": datetime, - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = \ + self._create_rbgen(op, start_time=datetime) + if self._debug: print(f"{datetime} FOUND rebuild start in term {term}, rb={puuid}/{ver}/{gen}/{op} " f"rank {rank}\t{host}\tPID {pid}\t{fname}") return True def _get_ps_rb_status_components(self, match): - # puuid, map version, rebuild-generation, operation, status, duration + # puuid, map version, rebuild-generation, operation, status, rc, fail_rank, duration # see re_rebuild_ldr_status - return self._get_rb_components(match) + (match.group(5), int(match.group(6))) + return self._get_rb_components(match) + (match.group(5), int(match.group(6)), + int(match.group(7)), int(match.group(8))) def _match_ps_rb_status(self, fname, line, pid, rank): # Do not match on new rebuild log format if we found legacy format if not self._check_rb_new_fmt: return False msg, host, datetime = self._get_line_components(line) - match = self.re_rebuild_ldr_status.match(msg) + match = self.re_rebuild_ldr_status_rf.match(msg) if not match: - return False + match = self.re_rebuild_ldr_status.match(msg) + if not match: + return False # Disable checking for legacy rebuild log format, to save execution time self._check_rb_legacy_fmt = False - puuid, ver, gen, op, status, dur = self._get_ps_rb_status_components(match) + puuid, ver, gen, op, status, rc, fail_rank, dur = self._get_ps_rb_status_components(match) if not self._is_leader(puuid, rank, pid): return True term = self._cur_term[puuid] @@ -311,23 +399,9 @@ def _match_ps_rb_status(self, fname, line, pid, rank): if ver not in self._pools[puuid][term]["maps"]: self._warn(f"pool {puuid} term {term} ver {ver} not in maps - add placeholder", fname, line) - self._pools[puuid][term]["maps"][ver] = { - "carryover": False, - "from_ver": ver, - "time": "xx/xx-xx:xx:xx.xx", - "rb_gens": {} - } - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": "xx/xx-xx:xx:xx.xx", - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver] = self._create_mapver(ver) + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = self._create_rbgen(op) + if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: existing_op = self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["op"] if op != existing_op: @@ -340,11 +414,18 @@ def _match_ps_rb_status(self, fname, line, pid, rank): self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["completed"] = True elif status == "aborted": self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["aborted"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["fail_rank"] = fail_rank + elif status == "failed": + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["failed"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["fail_rank"] = fail_rank + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["time"] = datetime + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["rc"] = rc self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["duration"] = dur if self._debug: print(f"{datetime} FOUND rebuild UPDATE term={term} rb={puuid}/{ver}/{gen}/{op} " - f"STATUS={status}, DUR={dur} seconds rank {rank}\t{host}\tPID {pid}\t{fname}") + f"STATUS={status}, rc={rc}, fail_rank={fail_rank}, DUR={dur} seconds " + f"rank {rank}\t{host}\tPID {pid}\t{fname}") return True def _get_legacy_ps_rb_status_components(self, match): @@ -358,9 +439,11 @@ def _match_legacy_ps_rb_status(self, fname, line, pid, rank): if not self._check_rb_legacy_fmt: return False msg, host, datetime = self._get_line_components(line) - match = self.re_old_ldr_status.match(msg) + match = self.re_old_ldr_status_rf.match(msg) if not match: - return False + match = self.re_old_ldr_status.match(msg) + if not match: + return False # Disable checking for new rebuild log format, to save execution time self._check_rb_new_fmt = False @@ -378,23 +461,9 @@ def _match_legacy_ps_rb_status(self, fname, line, pid, rank): if ver not in self._pools[puuid][term]["maps"]: self._warn(f"pool {puuid} term {term} ver {ver} not in maps - add placeholder", fname, line) - self._pools[puuid][term]["maps"][ver] = { - "carryover": False, - "from_ver": ver, - "time": "xx/xx-xx:xx:xx.xx", - "rb_gens": {} - } - self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = { - "op": op, - "start_time": "xx/xx-xx:xx:xx.xx", - "time": "xx/xx-xx:xx:xx.xx", - "started": True, - "scanning": False, - "pulling": False, - "completed": False, - "aborted": False, - "duration": 0 - } + self._pools[puuid][term]["maps"][ver] = self._create_mapver(ver) + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen] = self._create_rbgen(op) + if gen in self._pools[puuid][term]["maps"][ver]["rb_gens"]: existing_op = self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["op"] if op != existing_op: @@ -407,6 +476,8 @@ def _match_legacy_ps_rb_status(self, fname, line, pid, rank): self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["completed"] = True elif status == "aborted": self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["aborted"] = True + elif status == "failed": + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["failed"] = True self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["time"] = datetime self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["duration"] = dur if self._debug: @@ -414,6 +485,44 @@ def _match_legacy_ps_rb_status(self, fname, line, pid, rank): f"STATUS={status}, DUR={dur} seconds rank {rank}\t{host}\tPID {pid}\t{fname}") return True + def _get_ps_rb_hung_warn_components(self, match): + # puuid, map version, rebuild-generation, operation, status, duration + # see re_rebuild_ldr_scan_hung and re_rebuild_ldr_pull_hung + return self._get_rb_components(match) + (int(match.group(5)), int(match.group(6))) + + def _match_ps_rb_hung_warn(self, fname, line, pid, rank): + msg, host, datetime = self._get_line_components(line) + match1 = self.re_rebuild_ldr_scan_hung_rf.match(msg) + if not match1: + match1 = self.re_rebuild_ldr_scan_hung.match(msg) + match2 = self.re_rebuild_ldr_pull_hung_rf.match(msg) + if not match2: + match2 = self.re_rebuild_ldr_pull_hung.match(msg) + if not match1 and not match2: + return False + + if match1: + puuid, ver, gen, op, compl_eng, tot_eng = self._get_ps_rb_hung_warn_components(match1) + term = self._cur_term[puuid] + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["scan_hung"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["scan_num_eng_wait"] = compl_eng + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["scan_hung_time"] = datetime + if self._debug: + print(f"{datetime} FOUND rebuild SCAN hung term={term} rb={puuid}/{ver}/{gen}/{op} " + f"{compl_eng} / {tot_eng} done, rank {rank}\t{host}\tPID {pid}\t{fname}") + + if match2: + puuid, ver, gen, op, compl_eng, tot_eng = self._get_ps_rb_hung_warn_components(match2) + term = self._cur_term[puuid] + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["pull_hung"] = True + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["pull_num_eng_wait"] = compl_eng + self._pools[puuid][term]["maps"][ver]["rb_gens"][gen]["pull_hung_time"] = datetime + if self._debug: + print(f"{datetime} FOUND rebuild PULL hung term={term} rb={puuid}/{ver}/{gen}/{op} " + f"{compl_eng} / {tot_eng} done, rank {rank}\t{host}\tPID {pid}\t{fname}") + + return True + def scan_file(self, log_iter, rank=-1): """Scan a daos engine log file and insert important pool events into a nested dictionary""" fname = log_iter.fname @@ -447,16 +556,23 @@ def scan_file(self, log_iter, rank=-1): if self._match_legacy_ps_rb_start(fname, line, pid, rank): continue + # Find rebuild status updates if self._match_ps_rb_status(fname, line, pid, rank): continue if self._match_legacy_ps_rb_status(fname, line, pid, rank): continue + # Find rebuild scan or pull phase hung warnings + if self._match_ps_rb_hung_warn(fname, line, pid, rank): + continue + # Future: for a PID that is killed, clear any associated cur_ldr_rank / cur_ldr_pid. # At logfile end, it could be due to engine killed, or could just be log rotation. def print_pools(self): + # pylint: disable=too-many-locals + # pylint: disable=too-many-nested-blocks """Print all pools important events found in a nested dictionary""" # pd (pool dictionary): pool UUID -> td @@ -489,9 +605,16 @@ def print_pools(self): op = rd["op"] dur = rd["duration"] scan = rd["scanning"] + scan_hung = rd["scan_hung"] + scan_num_eng_wait = rd["scan_num_eng_wait"] pull = rd["pulling"] + pull_hung = rd["pull_hung"] + pull_num_eng_wait = rd["pull_num_eng_wait"] comp = rd["completed"] abrt = rd["aborted"] + fail = rd["failed"] + fail_rank = rd["fail_rank"] + rc = rd["rc"] st = rd["start_time"] ut = rd["time"] @@ -501,14 +624,35 @@ def print_pools(self): status = "aborted" elif comp: status = "completed" + elif fail: + status = "failed" elif pull: status = "pulling" elif scan: status = "scanning" + + # hung_status + hung_status = "" + if scan_hung: + hung_status += "scan-hung" + scan_hung_time = rd["scan_hung_time"] + if pull_hung: + if hung_status != "": + hung_status += "," + hung_status += "pull-hung" + pull_hung_time = rd["pull_hung_time"] + + # Print rebuild start, any hang warnings, and latest status updates print(f"{st} {puuid} RBSTRT {v}/{g}/{op}") - updated = scan or pull or comp or abrt - if updated: - print(f"{ut} {puuid} RBUPDT {v}/{g}/{op} {status} {dur} seconds") + if scan_hung: + print(f"{scan_hung_time} {puuid} RBHUNG {v}/{g}/{op} {hung_status}: " + f"{scan_num_eng_wait} engines not done scanning") + if pull_hung: + print(f"{pull_hung_time} {puuid} RBHUNG {v}/{g}/{op} {hung_status}: " + f"{pull_num_eng_wait} engines not done pulling") + if scan or pull or comp or abrt or fail: + print(f"{ut} {puuid} RBUPDT {v}/{g}/{op} {status} rc={rc} " + f"fail_rank={fail_rank} {dur} seconds") # Print term end (if there is a PS leader step_down) if e != "": diff --git a/src/tests/ftest/daos_test/rebuild.py b/src/tests/ftest/daos_test/rebuild.py index 62c66b7da21..012b157d42c 100644 --- a/src/tests/ftest/daos_test/rebuild.py +++ b/src/tests/ftest/daos_test/rebuild.py @@ -1,5 +1,6 @@ ''' (C) Copyright 2018-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -365,3 +366,19 @@ def test_rebuild_35(self): :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_35 """ self.run_subtest() + + def test_rebuild_36(self): + """Jira ID: DAOS-16111 + + Test Description: + Run daos_test -r -s5 -u subtests=36 + + Use cases: + Core tests for daos_test rebuild + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=unittest,rebuild + :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_36 + """ + self.run_subtest() diff --git a/src/tests/ftest/daos_test/rebuild.yaml b/src/tests/ftest/daos_test/rebuild.yaml index 9ceb3c02afb..0cac7bcffb4 100644 --- a/src/tests/ftest/daos_test/rebuild.yaml +++ b/src/tests/ftest/daos_test/rebuild.yaml @@ -13,6 +13,7 @@ timeouts: test_rebuild_33: 200 test_rebuild_34: 200 test_rebuild_35: 180 + test_rebuild_36: 250 pool: nvme_size: 0G @@ -81,6 +82,7 @@ daos_tests: test_rebuild_33: DAOS_Rebuild_33 test_rebuild_34: DAOS_Rebuild_34 test_rebuild_35: DAOS_Rebuild_35 + test_rebuild_36: DAOS_Rebuild_36 daos_test: test_rebuild_0to10: r test_rebuild_12to15: r @@ -104,6 +106,7 @@ daos_tests: test_rebuild_33: r test_rebuild_34: r test_rebuild_35: r + test_rebuild_36: r args: test_rebuild_0to10: -s3 -u subtests="0-10" test_rebuild_12to15: -s3 -u subtests="12-15" @@ -127,6 +130,7 @@ daos_tests: test_rebuild_33: -s5 -u subtests="33" test_rebuild_34: -s5 -u subtests="34" test_rebuild_35: -s5 -u subtests="35" + test_rebuild_36: -s5 -u subtests="36" stopped_ranks: test_rebuild_26: ["random"] test_rebuild_27: ["random"] diff --git a/src/tests/suite/daos_rebuild.c b/src/tests/suite/daos_rebuild.c index 20a751504ac..e76c02cf56f 100644 --- a/src/tests/suite/daos_rebuild.c +++ b/src/tests/suite/daos_rebuild.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1563,98 +1564,113 @@ rebuild_cont_destroy_and_reintegrate(void **state) reintegrate_single_pool_rank(arg, 5, true); } +static void +rebuild_long_scan_hang(void **state) +{ + test_arg_t *arg = *state; + d_rank_t fault_rank = 1; + d_rank_t excl_rank = 0; + int rc; + + if (!test_runable(arg, 7) || arg->pool.alive_svc->rl_nr < 5) { + print_message("need at least 5 svcs, -s5\n"); + return; + } + + print_message("inject DAOS_REBUILD_TGT_SCAN_HANG fault into engine rank %u\n", fault_rank); + test_set_engine_fail_loc(arg, fault_rank, DAOS_REBUILD_TGT_SCAN_HANG | DAOS_FAIL_ALWAYS); + + print_message("exclude engine rank %u to cause rebuild, and delay > 2 minutes\n", + excl_rank); + rc = dmg_pool_exclude(arg->dmg_config, arg->pool.pool_uuid, arg->group, excl_rank, -1); + assert_success(rc); + + /* wait a long time, cause PS leader engine logic to warn about "almost done" rebuild + * see update_and_warn_for_slow_engines() + */ + sleep(130); + print_message("delay done, clear fault on engine rank %u and wait for rebuild\n", + fault_rank); + test_set_engine_fail_loc(arg, fault_rank, 0); + test_rebuild_wait(&arg, 1); + print_message("rebuild done\n"); +} + /** create a new pool/container for each test */ static const struct CMUnitTest rebuild_tests[] = { - {"REBUILD0: drop rebuild scan reply", - rebuild_drop_scan, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD1: retry rebuild for not ready", - rebuild_retry_rebuild, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD2: drop rebuild obj reply", - rebuild_drop_obj, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD3: rebuild multiple pools", - rebuild_multiple_pools, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD4: rebuild update failed", - rebuild_update_failed, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD5: retry rebuild for pool stale", - rebuild_retry_for_stale_pool, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD6: rebuild with container destroy", - rebuild_destroy_container, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD7: rebuild with container close", - rebuild_close_container, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD8: rebuild with pool destroy during scan", - rebuild_destroy_pool_during_scan, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD9: rebuild with pool destroy during rebuild", - rebuild_destroy_pool_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD10: rebuild iv tgt fail", - rebuild_iv_tgt_fail, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD11: rebuild tgt start fail", - rebuild_tgt_start_fail, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD12: rebuild send objects failed", - rebuild_send_objects_fail, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD13: rebuild empty pool offline", - rebuild_offline_empty, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD14: rebuild no space failure", - rebuild_nospace, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD15: rebuild multiple tgts", - rebuild_multiple_tgts, rebuild_small_sub_setup, rebuild_sub_teardown}, - {"REBUILD16: disconnect pool during scan", - rebuild_tgt_pool_disconnect_in_scan, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD17: disconnect pool during rebuild", - rebuild_tgt_pool_disconnect_in_rebuild, rebuild_small_sub_setup, - rebuild_sub_teardown}, - {"REBUILD18: multi-pools rebuild concurrently", - multi_pools_rebuild_concurrently, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD19: rebuild with master change during scan", - rebuild_master_change_during_scan, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD20: rebuild with master change during rebuild", - rebuild_master_change_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD21: rebuild with master failure", - rebuild_master_failure, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD22: connect pool during scan for offline rebuild", - rebuild_offline_pool_connect_in_scan, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD23: connect pool during rebuild for offline rebuild", - rebuild_offline_pool_connect_in_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD24: offline rebuild", - rebuild_offline, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD25: rebuild with two failures", - rebuild_multiple_failures, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD26: rebuild fail all replicas before rebuild", - rebuild_fail_all_replicas_before_rebuild, rebuild_sub_rf1_setup, - rebuild_sub_teardown}, - {"REBUILD27: rebuild fail all replicas", - rebuild_fail_all_replicas, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD28: rebuild kill rank during rebuild", - rebuild_kill_rank_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD29: rebuild kill PS leader during rebuild", - rebuild_kill_PS_leader_during_rebuild, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD30: destroy pool during rebuild failure and retry", - rebuild_pool_destroy_during_rebuild_failure, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD31: reintegrate failure and retry", - reintegrate_failure_and_retry, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD32: kill more ranks than RF, then reintegrate", - rebuild_kill_more_RF_ranks, rebuild_sub_setup, - rebuild_sub_teardown}, - {"REBUILD33: delay rebuild and extend", - rebuild_delay_and_reintegrate, rebuild_sub_setup, rebuild_sub_teardown}, - {"REBUILD34: delay rebuild and extend", - rebuild_delay_and_extend, rebuild_sub_6nodes_rf1_setup, rebuild_sub_teardown}, - {"REBUILD35: destroy container then reintegrate", - rebuild_cont_destroy_and_reintegrate, rebuild_sub_6nodes_rf1_setup, - rebuild_sub_teardown}, + {"REBUILD0: drop rebuild scan reply", rebuild_drop_scan, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD1: retry rebuild for not ready", rebuild_retry_rebuild, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD2: drop rebuild obj reply", rebuild_drop_obj, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD3: rebuild multiple pools", rebuild_multiple_pools, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD4: rebuild update failed", rebuild_update_failed, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD5: retry rebuild for pool stale", rebuild_retry_for_stale_pool, + rebuild_small_sub_setup, rebuild_sub_teardown}, + {"REBUILD6: rebuild with container destroy", rebuild_destroy_container, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD7: rebuild with container close", rebuild_close_container, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD8: rebuild with pool destroy during scan", rebuild_destroy_pool_during_scan, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD9: rebuild with pool destroy during rebuild", rebuild_destroy_pool_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD10: rebuild iv tgt fail", rebuild_iv_tgt_fail, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD11: rebuild tgt start fail", rebuild_tgt_start_fail, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD12: rebuild send objects failed", rebuild_send_objects_fail, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD13: rebuild empty pool offline", rebuild_offline_empty, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD14: rebuild no space failure", rebuild_nospace, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD15: rebuild multiple tgts", rebuild_multiple_tgts, rebuild_small_sub_setup, + rebuild_sub_teardown}, + {"REBUILD16: disconnect pool during scan", rebuild_tgt_pool_disconnect_in_scan, + rebuild_small_sub_setup, rebuild_sub_teardown}, + {"REBUILD17: disconnect pool during rebuild", rebuild_tgt_pool_disconnect_in_rebuild, + rebuild_small_sub_setup, rebuild_sub_teardown}, + {"REBUILD18: multi-pools rebuild concurrently", multi_pools_rebuild_concurrently, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD19: rebuild with master change during scan", rebuild_master_change_during_scan, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD20: rebuild with master change during rebuild", rebuild_master_change_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD21: rebuild with master failure", rebuild_master_failure, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD22: connect pool during scan for offline rebuild", + rebuild_offline_pool_connect_in_scan, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD23: connect pool during rebuild for offline rebuild", + rebuild_offline_pool_connect_in_rebuild, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD24: offline rebuild", rebuild_offline, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD25: rebuild with two failures", rebuild_multiple_failures, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD26: rebuild fail all replicas before rebuild", + rebuild_fail_all_replicas_before_rebuild, rebuild_sub_rf1_setup, rebuild_sub_teardown}, + {"REBUILD27: rebuild fail all replicas", rebuild_fail_all_replicas, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD28: rebuild kill rank during rebuild", rebuild_kill_rank_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD29: rebuild kill PS leader during rebuild", rebuild_kill_PS_leader_during_rebuild, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD30: destroy pool during rebuild failure and retry", + rebuild_pool_destroy_during_rebuild_failure, rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD31: reintegrate failure and retry", reintegrate_failure_and_retry, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD32: kill more ranks than RF, then reintegrate", rebuild_kill_more_RF_ranks, + rebuild_sub_setup, rebuild_sub_teardown}, + {"REBUILD33: delay rebuild and extend", rebuild_delay_and_reintegrate, rebuild_sub_setup, + rebuild_sub_teardown}, + {"REBUILD34: delay rebuild and extend", rebuild_delay_and_extend, rebuild_sub_6nodes_rf1_setup, + rebuild_sub_teardown}, + {"REBUILD35: destroy container then reintegrate", rebuild_cont_destroy_and_reintegrate, + rebuild_sub_6nodes_rf1_setup, rebuild_sub_teardown}, + {"REBUILD36: single engine scan lengthy hang", rebuild_long_scan_hang, rebuild_sub_setup, + rebuild_sub_teardown}, }; /* TODO: Enable aggregation once stable view rebuild is done. */