From f4488bdd4034b6a2e630dcb9b829d23e17e0f720 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 26 Dec 2023 22:47:16 +0800 Subject: [PATCH 1/5] DAOS-14725 client: force cleanup of event queue on test teardown For test cases that are driven by CMOCKA, test_case_teardown() may be triggered by a signal that is caught by the signal handler registered by CMOCKA. In such a case, the related logic has long-jumped out of the original DAOS lower-level (object or cart) context. Related tasks may still be in RUNNING status, but they are no longer runnable even if re-scheduled. It means that we may lose some resources attached to the related tasks, but that is not fatal: they are reclaimed automatically when the current test process exits. What we can do for test cleanup is to try to reset "ev_thpriv", which will be reused by subsequent test cases. Otherwise, one test failure may block all subsequent test cases, which is unacceptable. For this purpose we introduce a "force" parameter for the DAOS internal API daos_event_priv_reset() to clean up the shared event queue by force. The patch also contains some other test environment cleanup. Required-githooks: true Signed-off-by: Fan Yong --- src/client/api/client_internal.h | 2 +- src/client/api/event.c | 47 ++++++++++++++++---------------- src/tests/suite/daos_base_tx.c | 7 +++-- src/tests/suite/daos_test.h | 4 +-- 4 files changed, 31 insertions(+), 29 deletions(-) diff --git a/src/client/api/client_internal.h b/src/client/api/client_internal.h index cfbca3afa68..e6f94faf806 100644 --- a/src/client/api/client_internal.h +++ b/src/client/api/client_internal.h @@ -118,7 +118,7 @@ daos_eqx2eq(struct daos_eq_private *eqx) * next test case due to dirty ev_thpriv status. 
*/ int -daos_event_priv_reset(void); +daos_event_priv_reset(bool force); /** * Retrieve the private per-thread event diff --git a/src/client/api/event.c b/src/client/api/event.c index 2ab910ac7bf..b584776dfa2 100644 --- a/src/client/api/event.c +++ b/src/client/api/event.c @@ -1057,13 +1057,8 @@ daos_event_init(struct daos_event *ev, daos_handle_t eqh, return rc; } -/** - * Unlink events from various list, parent_list, child list, - * and event queue hash list, and destroy all of the child - * events - **/ -int -daos_event_fini(struct daos_event *ev) +static int +daos_event_fini_internal(struct daos_event *ev, bool force) { struct daos_event_private *evx = daos_ev2evx(ev); struct daos_eq_private *eqx = NULL; @@ -1080,7 +1075,7 @@ daos_event_fini(struct daos_event *ev) D_MUTEX_LOCK(&eqx->eqx_lock); } - if (evx->evx_status == DAOS_EVS_RUNNING) { + if (evx->evx_status == DAOS_EVS_RUNNING && !force) { rc = -DER_BUSY; goto out; } @@ -1095,14 +1090,9 @@ daos_event_fini(struct daos_event *ev) tmp = d_list_entry(evx->evx_child.next, struct daos_event_private, evx_link); - D_ASSERTF(tmp->evx_status == DAOS_EVS_READY || - tmp->evx_status == DAOS_EVS_COMPLETED || - tmp->evx_status == DAOS_EVS_ABORTED, - "EV %p status: %d\n", tmp, tmp->evx_status); - if (tmp->evx_status != DAOS_EVS_READY && tmp->evx_status != DAOS_EVS_COMPLETED && - tmp->evx_status != DAOS_EVS_ABORTED) { + tmp->evx_status != DAOS_EVS_ABORTED && !force) { D_ERROR("Child event %p launched: %d\n", daos_evx2ev(tmp), tmp->evx_status); rc = -DER_INVAL; goto out; @@ -1111,7 +1101,7 @@ daos_event_fini(struct daos_event *ev) if (eqx != NULL) D_MUTEX_UNLOCK(&eqx->eqx_lock); - rc = daos_event_fini(daos_evx2ev(tmp)); + rc = daos_event_fini_internal(daos_evx2ev(tmp), force); if (rc < 0) { D_ERROR("Failed to finalize child event "DF_RC"\n", DP_RC(rc)); goto out_unlocked; @@ -1119,9 +1109,6 @@ daos_event_fini(struct daos_event *ev) if (eqx != NULL) D_MUTEX_LOCK(&eqx->eqx_lock); - - tmp->evx_status = DAOS_EVS_READY; - 
tmp->evx_parent = NULL; } /* If it is a child event, delete it from parent list */ @@ -1132,7 +1119,7 @@ daos_event_fini(struct daos_event *ev) goto out; } - if (evx->evx_parent->evx_status != DAOS_EVS_READY) { + if (evx->evx_parent->evx_status != DAOS_EVS_READY && !force) { D_ERROR("Parent event not init or launched: %d\n", evx->evx_parent->evx_status); rc = -DER_INVAL; @@ -1142,13 +1129,14 @@ daos_event_fini(struct daos_event *ev) d_list_del_init(&evx->evx_link); evx->evx_status = DAOS_EVS_READY; evx->evx_parent = NULL; - evx->evx_ctx = NULL; } /* Remove from the evx_link */ if (!d_list_empty(&evx->evx_link)) { d_list_del(&evx->evx_link); - D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING); + + if (!force) + D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING); if (evx->evx_status == DAOS_EVS_COMPLETED && eq != NULL) { D_ASSERTF(eq->eq_n_comp > 0, "eq %p\n", eq); @@ -1166,6 +1154,17 @@ daos_event_fini(struct daos_event *ev) return rc; } +/** + * Unlink events from various list, parent_list, child list, + * and event queue hash list, and destroy all of the child + * events + **/ +int +daos_event_fini(struct daos_event *ev) +{ + return daos_event_fini_internal(ev, false); +} + struct daos_event * daos_event_next(struct daos_event *parent, struct daos_event *child) @@ -1222,12 +1221,12 @@ daos_event_abort(struct daos_event *ev) } int -daos_event_priv_reset(void) +daos_event_priv_reset(bool force) { int rc; if (ev_thpriv_is_init) { - rc = daos_event_fini(&ev_thpriv); + rc = daos_event_fini_internal(&ev_thpriv, force); if (rc) { D_ERROR("Failed to finalize thread private event "DF_RC"\n", DP_RC(rc)); return rc; @@ -1310,7 +1309,7 @@ daos_event_priv_wait() /** on success, the event should have been reset to ready stat by the progress cb */ if (rc == 0) D_ASSERT(evx->evx_status == DAOS_EVS_READY); - rc2 = daos_event_priv_reset(); + rc2 = daos_event_priv_reset(false); if (rc2) { if (rc == 0) rc = rc2; diff --git a/src/tests/suite/daos_base_tx.c b/src/tests/suite/daos_base_tx.c 
index 7800ed7d2c9..d4d30ef7b05 100644 --- a/src/tests/suite/daos_base_tx.c +++ b/src/tests/suite/daos_base_tx.c @@ -704,6 +704,9 @@ dtx_resend_delay(test_arg_t *arg, daos_oclass_id_t oclass) daos_fail_loc_set(0); dtx_set_fail_loc(arg, 0); + /* Wait for the former delayed RPC before destroying the container to avoid DER_BUSY. */ + sleep(2); + D_FREE(update_buf); D_FREE(fetch_buf); ioreq_fini(&req); @@ -941,9 +944,9 @@ static const struct CMUnitTest dtx_tests[] = { {"DTX19: DTX resend during bulk data transfer - multiple reps", dtx_19, NULL, test_case_teardown}, {"DTX20: race between DTX refresh and DTX resync", - dtx_20, dtx_base_rf1_setup, test_case_teardown}, + dtx_20, dtx_base_rf1_setup, rebuild_sub_teardown}, {"DTX21: do not abort partially committed DTX", - dtx_21, dtx_base_rf0_setup, test_case_teardown}, + dtx_21, dtx_base_rf0_setup, rebuild_sub_teardown}, }; static int diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h index b0c43d5b9f7..a9595e1a368 100644 --- a/src/tests/suite/daos_test.h +++ b/src/tests/suite/daos_test.h @@ -66,7 +66,7 @@ extern char *test_io_dir; /* the IO conf file*/ extern const char *test_io_conf; -extern int daos_event_priv_reset(void); +extern int daos_event_priv_reset(bool force); #define TEST_RANKS_MAX_NUM (13) #define DAOS_SERVER_CONF "/etc/daos/daos_server.yml" #define DAOS_SERVER_CONF_LENGTH 512 @@ -306,7 +306,7 @@ async_overlap(void **state) static inline int test_case_teardown(void **state) { - assert_rc_equal(daos_event_priv_reset(), 0); + assert_rc_equal(daos_event_priv_reset(true), 0); return 0; } From e9a5508c2fcffe856d7f5acc8595f086b05fb072 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 9 Jan 2024 16:33:59 +0800 Subject: [PATCH 2/5] DAOS-14725 tests: force cleanup when long-jumped out of cmocka Force cleanup of the test environment only when teardown is reached through a long jump from cmocka's signal handling. 
Signed-off-by: Fan Yong --- src/tests/suite/daos_test.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h index a9595e1a368..a3868035705 100644 --- a/src/tests/suite/daos_test.h +++ b/src/tests/suite/daos_test.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -306,7 +307,24 @@ async_overlap(void **state) static inline int test_case_teardown(void **state) { - assert_rc_equal(daos_event_priv_reset(true), 0); + sigset_t sigset; + bool force = false; + + /* + * If one of SIGFPE/SIGILL/SIGSEGV/SIGBUS/SIGSYS is in the signal mask, then the logic is + * longjump from cmocka for handling the signal, then need force cleanup test environment. + */ + if (sigprocmask(0, NULL, &sigset) < 0) { + print_message("sigprocmask failure\n"); + } else if (unlikely(sigismember(&sigset, SIGFPE) || + sigismember(&sigset, SIGILL) || + sigismember(&sigset, SIGSEGV) || + sigismember(&sigset, SIGBUS) || + sigismember(&sigset, SIGSYS))) { + force = true; + } + + assert_rc_equal(daos_event_priv_reset(force), 0); return 0; } From cd8e71987dba75ac7a74092b4dc29e2a4c238a50 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 9 Jan 2024 18:35:54 +0800 Subject: [PATCH 3/5] DAOS-14725 tests: print error message when hitting test corruption Print the name of the signal found in the signal mask so that a forced cleanup can be told apart from a normal teardown. 
Signed-off-by: Fan Yong --- src/tests/suite/daos_test.h | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h index a3868035705..0c62bac69bc 100644 --- a/src/tests/suite/daos_test.h +++ b/src/tests/suite/daos_test.h @@ -307,8 +307,9 @@ async_overlap(void **state) static inline int test_case_teardown(void **state) { - sigset_t sigset; - bool force = false; + char *str = NULL; + sigset_t sigset; + bool force = false; /* * If one of SIGFPE/SIGILL/SIGSEGV/SIGBUS/SIGSYS is in the signal mask, then the logic is @@ -316,12 +317,22 @@ test_case_teardown(void **state) */ if (sigprocmask(0, NULL, &sigset) < 0) { print_message("sigprocmask failure\n"); - } else if (unlikely(sigismember(&sigset, SIGFPE) || - sigismember(&sigset, SIGILL) || - sigismember(&sigset, SIGSEGV) || - sigismember(&sigset, SIGBUS) || - sigismember(&sigset, SIGSYS))) { - force = true; + } else { + if (unlikely(sigismember(&sigset, SIGFPE))) + str = "SIGFPE"; + else if (unlikely(sigismember(&sigset, SIGILL))) + str = "SIGILL"; + else if (unlikely( sigismember(&sigset, SIGSEGV))) + str = "SIGBUS"; + else if (unlikely(sigismember(&sigset, SIGBUS))) + str = "SIGBUS"; + else if (unlikely(sigismember(&sigset, SIGSYS))) + str = "SIGSYS"; + + if (str != NULL) { + print_message("Hit corruption (%s), cleanup by force\n", str); + force = true; + } } assert_rc_equal(daos_event_priv_reset(force), 0); From 0073bae3159783a1dfb2a0c92be5c2f259616478 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 9 Jan 2024 18:46:46 +0800 Subject: [PATCH 4/5] DAOS-14725 tests: fix typo Signed-off-by: Fan Yong --- src/tests/suite/daos_test.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h index 0c62bac69bc..036540b940f 100644 --- a/src/tests/suite/daos_test.h +++ b/src/tests/suite/daos_test.h @@ -322,8 +322,8 @@ test_case_teardown(void **state) str = "SIGFPE"; 
else if (unlikely(sigismember(&sigset, SIGILL))) str = "SIGILL"; - else if (unlikely( sigismember(&sigset, SIGSEGV))) - str = "SIGBUS"; + else if (unlikely(sigismember(&sigset, SIGSEGV))) + str = "SIGSEGV"; else if (unlikely(sigismember(&sigset, SIGBUS))) str = "SIGBUS"; else if (unlikely(sigismember(&sigset, SIGSYS))) From 3ea6c8f56cbd9d628b28cd063d2bc883af51811f Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 11 Jan 2024 11:18:29 +0800 Subject: [PATCH 5/5] DAOS-14725 tests: test case for cleanup after corruption Trigger SIGFPE signal. Signed-off-by: Fan Yong --- src/include/daos/common.h | 2 ++ src/object/cli_obj.c | 6 ++++ src/tests/suite/daos_verify_consistency.c | 42 +++++++++++++++++++++-- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/include/daos/common.h b/src/include/daos/common.h index 7a3088b53c8..9944fe25bdd 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -831,6 +831,8 @@ enum { #define DAOS_DTX_RESYNC_DELAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4a) #define DAOS_DTX_FAIL_COMMIT (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4b) +#define DAOS_VC_SYNC_CORRUPTION (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4c) + #define DAOS_NVME_FAULTY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x50) #define DAOS_NVME_WRITE_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x51) #define DAOS_NVME_READ_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x52) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 088e87067c4..eb0afa2bb9f 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -7672,6 +7672,12 @@ dc_obj_sync(tse_task_t *task) D_GOTO(out_task, rc); } + if (DAOS_FAIL_CHECK(DAOS_VC_SYNC_CORRUPTION)) { + /* It will trigger SIGFPE to simulate corruption. 
*/ + rc = args->epoch / *args->nr; + D_ASSERTF(rc < 0, "Unexpected result %d\n", rc); + } + obj_auxi->spec_shard = 0; obj_auxi->spec_group = 0; diff --git a/src/tests/suite/daos_verify_consistency.c b/src/tests/suite/daos_verify_consistency.c index f5f03ddc1af..19f0a5a4fa9 100644 --- a/src/tests/suite/daos_verify_consistency.c +++ b/src/tests/suite/daos_verify_consistency.c @@ -334,8 +334,44 @@ vc_8(void **state) ioreq_fini(&req); } +static inline int +vc_test_teardown(void **state) +{ + daos_fail_loc_set(0); + + return test_case_teardown(state); +} + static void vc_9(void **state) +{ + test_arg_t *arg = *state; + daos_obj_id_t oid; + struct ioreq req; + + FAULT_INJECTION_REQUIRED(); + + print_message("sync corruption during verify\n"); + + if (!test_runable(arg, dts_vc_replica_cnt)) + return; + + oid = daos_test_oid_gen(arg->coh, dts_vc_class, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + vc_gen_modifications(arg, &req, oid, 7, 7, 7, 0, 0, 0); + + daos_fail_loc_set(DAOS_VC_SYNC_CORRUPTION | DAOS_FAIL_ONCE); + + /* Do not care about consistency, just verify the cleanup logic after corruption. */ + vc_obj_verify(arg, oid); + + daos_fail_loc_set(0); + ioreq_fini(&req); +} + +static void +vc_10(void **state) { test_arg_t *arg = *state; daos_obj_id_t oid; @@ -378,8 +414,10 @@ static const struct CMUnitTest vc_tests[] = { vc_7, NULL, test_case_teardown}, {"VC8: verify with lost replica", vc_8, NULL, test_case_teardown}, - {"VC9: verify with different dkey", - vc_9, NULL, test_case_teardown}, + {"VC9: sync corruption during verify", + vc_9, NULL, vc_test_teardown}, + {"VC10: verify with different dkey", + vc_10, NULL, test_case_teardown}, }; static int