diff --git a/src/client/api/client_internal.h b/src/client/api/client_internal.h index cfbca3afa68..e6f94faf806 100644 --- a/src/client/api/client_internal.h +++ b/src/client/api/client_internal.h @@ -118,7 +118,7 @@ daos_eqx2eq(struct daos_eq_private *eqx) * next test case due to dirty ev_thpriv status. */ int -daos_event_priv_reset(void); +daos_event_priv_reset(bool force); /** * Retrieve the private per-thread event diff --git a/src/client/api/event.c b/src/client/api/event.c index fa31cfd5ac4..279dd086f14 100644 --- a/src/client/api/event.c +++ b/src/client/api/event.c @@ -1057,13 +1057,8 @@ daos_event_init(struct daos_event *ev, daos_handle_t eqh, return rc; } -/** - * Unlink events from various list, parent_list, child list, - * and event queue hash list, and destroy all of the child - * events - **/ -int -daos_event_fini(struct daos_event *ev) +static int +daos_event_fini_internal(struct daos_event *ev, bool force) { struct daos_event_private *evx = daos_ev2evx(ev); struct daos_eq_private *eqx = NULL; @@ -1080,7 +1075,7 @@ daos_event_fini(struct daos_event *ev) D_MUTEX_LOCK(&eqx->eqx_lock); } - if (evx->evx_status == DAOS_EVS_RUNNING) { + if (evx->evx_status == DAOS_EVS_RUNNING && !force) { rc = -DER_BUSY; goto out; } @@ -1095,14 +1090,9 @@ daos_event_fini(struct daos_event *ev) tmp = d_list_entry(evx->evx_child.next, struct daos_event_private, evx_link); - D_ASSERTF(tmp->evx_status == DAOS_EVS_READY || - tmp->evx_status == DAOS_EVS_COMPLETED || - tmp->evx_status == DAOS_EVS_ABORTED, - "EV %p status: %d\n", tmp, tmp->evx_status); - if (tmp->evx_status != DAOS_EVS_READY && tmp->evx_status != DAOS_EVS_COMPLETED && - tmp->evx_status != DAOS_EVS_ABORTED) { + tmp->evx_status != DAOS_EVS_ABORTED && !force) { D_ERROR("Child event %p launched: %d\n", daos_evx2ev(tmp), tmp->evx_status); rc = -DER_INVAL; goto out; @@ -1111,7 +1101,7 @@ daos_event_fini(struct daos_event *ev) if (eqx != NULL) D_MUTEX_UNLOCK(&eqx->eqx_lock); - rc = daos_event_fini(daos_evx2ev(tmp)); + rc = daos_event_fini_internal(daos_evx2ev(tmp), force); if (rc < 0) { D_ERROR("Failed to finalize child event "DF_RC"\n", DP_RC(rc)); goto out_unlocked; @@ -1119,9 +1109,6 @@ daos_event_fini(struct daos_event *ev) if (eqx != NULL) D_MUTEX_LOCK(&eqx->eqx_lock); - - tmp->evx_status = DAOS_EVS_READY; - tmp->evx_parent = NULL; } /* If it is a child event, delete it from parent list */ @@ -1132,7 +1119,7 @@ daos_event_fini(struct daos_event *ev) goto out; } - if (evx->evx_parent->evx_status != DAOS_EVS_READY) { + if (evx->evx_parent->evx_status != DAOS_EVS_READY && !force) { D_ERROR("Parent event not init or launched: %d\n", evx->evx_parent->evx_status); rc = -DER_INVAL; @@ -1142,13 +1129,14 @@ daos_event_fini(struct daos_event *ev) d_list_del_init(&evx->evx_link); evx->evx_status = DAOS_EVS_READY; evx->evx_parent = NULL; - evx->evx_ctx = NULL; } /* Remove from the evx_link */ if (!d_list_empty(&evx->evx_link)) { d_list_del(&evx->evx_link); - D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING); + + if (!force) + D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING); if (evx->evx_status == DAOS_EVS_COMPLETED && eq != NULL) { D_ASSERTF(eq->eq_n_comp > 0, "eq %p\n", eq); @@ -1166,6 +1154,17 @@ daos_event_fini(struct daos_event *ev) return rc; } +/** + * Unlink events from various list, parent_list, child list, + * and event queue hash list, and destroy all of the child + * events + **/ +int +daos_event_fini(struct daos_event *ev) +{ + return daos_event_fini_internal(ev, false); +} + struct daos_event * daos_event_next(struct daos_event *parent, struct daos_event *child) @@ -1222,12 +1221,12 @@ daos_event_abort(struct daos_event *ev) } int -daos_event_priv_reset(void) +daos_event_priv_reset(bool force) { int rc; if (ev_thpriv_is_init) { - rc = daos_event_fini(&ev_thpriv); + rc = daos_event_fini_internal(&ev_thpriv, force); if (rc) { D_ERROR("Failed to finalize thread private event "DF_RC"\n", DP_RC(rc)); return rc; @@ -1310,7 +1309,7 @@ daos_event_priv_wait() /** on success, the event should have been reset to ready stat by the progress cb */ if (rc == 0) D_ASSERT(evx->evx_status == DAOS_EVS_READY); - rc2 = daos_event_priv_reset(); + rc2 = daos_event_priv_reset(false); if (rc2) { if (rc == 0) rc = rc2; diff --git a/src/include/daos/common.h b/src/include/daos/common.h index 7a3088b53c8..9944fe25bdd 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -831,6 +831,8 @@ enum { #define DAOS_DTX_RESYNC_DELAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4a) #define DAOS_DTX_FAIL_COMMIT (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4b) +#define DAOS_VC_SYNC_CORRUPTION (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4c) + #define DAOS_NVME_FAULTY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x50) #define DAOS_NVME_WRITE_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x51) #define DAOS_NVME_READ_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x52) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 088e87067c4..eb0afa2bb9f 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -7672,6 +7672,12 @@ dc_obj_sync(tse_task_t *task) D_GOTO(out_task, rc); } + if (DAOS_FAIL_CHECK(DAOS_VC_SYNC_CORRUPTION)) { + /* It will trigger SIGFPE to simulate corruption. */ + rc = args->epoch / *args->nr; + D_ASSERTF(rc < 0, "Unexpected result %d\n", rc); + } + obj_auxi->spec_shard = 0; obj_auxi->spec_group = 0; diff --git a/src/tests/suite/daos_base_tx.c b/src/tests/suite/daos_base_tx.c index 7800ed7d2c9..d4d30ef7b05 100644 --- a/src/tests/suite/daos_base_tx.c +++ b/src/tests/suite/daos_base_tx.c @@ -704,6 +704,9 @@ dtx_resend_delay(test_arg_t *arg, daos_oclass_id_t oclass) daos_fail_loc_set(0); dtx_set_fail_loc(arg, 0); + /* Wait for the former delayed RPC before destroying the container to avoid DER_BUSY. */ + sleep(2); + D_FREE(update_buf); D_FREE(fetch_buf); ioreq_fini(&req); @@ -941,9 +944,9 @@ static const struct CMUnitTest dtx_tests[] = { {"DTX19: DTX resend during bulk data transfer - multiple reps", dtx_19, NULL, test_case_teardown}, {"DTX20: race between DTX refresh and DTX resync", - dtx_20, dtx_base_rf1_setup, test_case_teardown}, + dtx_20, dtx_base_rf1_setup, rebuild_sub_teardown}, {"DTX21: do not abort partially committed DTX", - dtx_21, dtx_base_rf0_setup, test_case_teardown}, + dtx_21, dtx_base_rf0_setup, rebuild_sub_teardown}, }; static int diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h index 4c8d6e8f31e..c2c8d6a489e 100644 --- a/src/tests/suite/daos_test.h +++ b/src/tests/suite/daos_test.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -66,7 +67,7 @@ extern char *test_io_dir; /* the IO conf file*/ extern const char *test_io_conf; -extern int daos_event_priv_reset(void); +extern int daos_event_priv_reset(bool force); #define TEST_RANKS_MAX_NUM (13) #define DAOS_SERVER_CONF "/etc/daos/daos_server.yml" #define DAOS_SERVER_CONF_LENGTH 512 @@ -306,7 +307,35 @@ async_overlap(void **state) static inline int test_case_teardown(void **state) { - assert_rc_equal(daos_event_priv_reset(), 0); + char *str = NULL; + sigset_t sigset; + bool force = false; + + /* + * If one of SIGFPE/SIGILL/SIGSEGV/SIGBUS/SIGSYS is in the signal mask, then the logic is + * longjump from cmocka for handling the signal, then need force cleanup test environment. + */ + if (sigprocmask(0, NULL, &sigset) < 0) { + print_message("sigprocmask failure\n"); + } else { + if (unlikely(sigismember(&sigset, SIGFPE))) + str = "SIGFPE"; + else if (unlikely(sigismember(&sigset, SIGILL))) + str = "SIGILL"; + else if (unlikely(sigismember(&sigset, SIGSEGV))) + str = "SIGSEGV"; + else if (unlikely(sigismember(&sigset, SIGBUS))) + str = "SIGBUS"; + else if (unlikely(sigismember(&sigset, SIGSYS))) + str = "SIGSYS"; + + if (str != NULL) { + print_message("Hit corruption (%s), cleanup by force\n", str); + force = true; + } + } + + assert_rc_equal(daos_event_priv_reset(force), 0); return 0; } diff --git a/src/tests/suite/daos_verify_consistency.c b/src/tests/suite/daos_verify_consistency.c index f5f03ddc1af..19f0a5a4fa9 100644 --- a/src/tests/suite/daos_verify_consistency.c +++ b/src/tests/suite/daos_verify_consistency.c @@ -334,8 +334,44 @@ vc_8(void **state) ioreq_fini(&req); } +static inline int +vc_test_teardown(void **state) +{ + daos_fail_loc_set(0); + + return test_case_teardown(state); +} + static void vc_9(void **state) +{ + test_arg_t *arg = *state; + daos_obj_id_t oid; + struct ioreq req; + + FAULT_INJECTION_REQUIRED(); + + print_message("sync corruption during verify\n"); + + if (!test_runable(arg, dts_vc_replica_cnt)) + return; + + oid = daos_test_oid_gen(arg->coh, dts_vc_class, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + vc_gen_modifications(arg, &req, oid, 7, 7, 7, 0, 0, 0); + + daos_fail_loc_set(DAOS_VC_SYNC_CORRUPTION | DAOS_FAIL_ONCE); + + /* Do not care about consistency, just verify the cleanup logic after corruption. */ + vc_obj_verify(arg, oid); + + daos_fail_loc_set(0); + ioreq_fini(&req); +} + +static void +vc_10(void **state) { test_arg_t *arg = *state; daos_obj_id_t oid; @@ -378,8 +414,10 @@ static const struct CMUnitTest vc_tests[] = { vc_7, NULL, test_case_teardown}, {"VC8: verify with lost replica", vc_8, NULL, test_case_teardown}, - {"VC9: verify with different dkey", - vc_9, NULL, test_case_teardown}, + {"VC9: sync corruption during verify", + vc_9, NULL, vc_test_teardown}, + {"VC10: verify with different dkey", + vc_10, NULL, test_case_teardown}, }; static int