daos-stack · Nasf-Fan · Dec 26, 2023 · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024
@@ -118,7 +118,7 @@ daos_eqx2eq(struct daos_eq_private *eqx)
  * next test case due to dirty ev_thpriv status.
  */
 int
-daos_event_priv_reset(void);
+daos_event_priv_reset(bool force);
 
 /**
  * Retrieve the private per-thread event

@@ -1057,13 +1057,8 @@ daos_event_init(struct daos_event *ev, daos_handle_t eqh,
 	return rc;
 }
 
-/**
- * Unlink events from various list, parent_list, child list,
- * and event queue hash list, and destroy all of the child
- * events
- **/
-int
-daos_event_fini(struct daos_event *ev)
+static int
+daos_event_fini_internal(struct daos_event *ev, bool force)
 {
 	struct daos_event_private	*evx = daos_ev2evx(ev);
 	struct daos_eq_private		*eqx = NULL;
@@ -1080,7 +1075,7 @@ daos_event_fini(struct daos_event *ev)
 		D_MUTEX_LOCK(&eqx->eqx_lock);
 	}
 
-	if (evx->evx_status == DAOS_EVS_RUNNING) {
+	if (evx->evx_status == DAOS_EVS_RUNNING && !force) {
 		rc = -DER_BUSY;
 		goto out;
 	}
@@ -1095,14 +1090,9 @@ daos_event_fini(struct daos_event *ev)
 
 		tmp = d_list_entry(evx->evx_child.next,
 				   struct daos_event_private, evx_link);
-		D_ASSERTF(tmp->evx_status == DAOS_EVS_READY ||
-			  tmp->evx_status == DAOS_EVS_COMPLETED ||
-			  tmp->evx_status == DAOS_EVS_ABORTED,
-			 "EV %p status: %d\n", tmp, tmp->evx_status);
-
 		if (tmp->evx_status != DAOS_EVS_READY &&
 		    tmp->evx_status != DAOS_EVS_COMPLETED &&
-		    tmp->evx_status != DAOS_EVS_ABORTED) {
+		    tmp->evx_status != DAOS_EVS_ABORTED && !force) {
 			D_ERROR("Child event %p launched: %d\n", daos_evx2ev(tmp), tmp->evx_status);
 			rc = -DER_INVAL;
 			goto out;
@@ -1111,17 +1101,14 @@ daos_event_fini(struct daos_event *ev)
 		if (eqx != NULL)
 			D_MUTEX_UNLOCK(&eqx->eqx_lock);
 
-		rc = daos_event_fini(daos_evx2ev(tmp));
+		rc = daos_event_fini_internal(daos_evx2ev(tmp), force);
 		if (rc < 0) {
 			D_ERROR("Failed to finalize child event "DF_RC"\n", DP_RC(rc));
 			goto out_unlocked;
 		}
 
 		if (eqx != NULL)
 			D_MUTEX_LOCK(&eqx->eqx_lock);
-
-		tmp->evx_status = DAOS_EVS_READY;
-		tmp->evx_parent = NULL;
 	}
 
 	/* If it is a child event, delete it from parent list */
@@ -1132,7 +1119,7 @@ daos_event_fini(struct daos_event *ev)
 			goto out;
 		}
 
-		if (evx->evx_parent->evx_status != DAOS_EVS_READY) {
+		if (evx->evx_parent->evx_status != DAOS_EVS_READY && !force) {
 			D_ERROR("Parent event not init or launched: %d\n",
 				evx->evx_parent->evx_status);
 			rc = -DER_INVAL;
@@ -1142,13 +1129,14 @@ daos_event_fini(struct daos_event *ev)
 		d_list_del_init(&evx->evx_link);
 		evx->evx_status = DAOS_EVS_READY;
 		evx->evx_parent = NULL;
-		evx->evx_ctx = NULL;
 	}
 
 	/* Remove from the evx_link */
 	if (!d_list_empty(&evx->evx_link)) {
 		d_list_del(&evx->evx_link);
-		D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING);
+
+		if (!force)
+			D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING);
 
 		if (evx->evx_status == DAOS_EVS_COMPLETED && eq != NULL) {
 			D_ASSERTF(eq->eq_n_comp > 0, "eq %p\n", eq);
@@ -1166,6 +1154,17 @@ daos_event_fini(struct daos_event *ev)
 	return rc;
 }
 
+/**
+ * Unlink the event from its parent list, child list and event queue
+ * hash list, and destroy all of its child events. This is the
+ * non-forced variant: a running event is rejected with -DER_BUSY.
+ **/
+int
+daos_event_fini(struct daos_event *ev)
+{
+	return daos_event_fini_internal(ev, false);
+}
+
 struct daos_event *
 daos_event_next(struct daos_event *parent,
 		struct daos_event *child)
@@ -1222,12 +1221,12 @@ daos_event_abort(struct daos_event *ev)
 }
 
 int
-daos_event_priv_reset(void)
+daos_event_priv_reset(bool force)
 {
 	int rc;
 
 	if (ev_thpriv_is_init) {
-		rc = daos_event_fini(&ev_thpriv);
+		rc = daos_event_fini_internal(&ev_thpriv, force);
 		if (rc) {
 			D_ERROR("Failed to finalize thread private event "DF_RC"\n", DP_RC(rc));
 			return rc;
@@ -1310,7 +1309,7 @@ daos_event_priv_wait()
 	/** on success, the event should have been reset to ready stat by the progress cb */
 	if (rc == 0)
 		D_ASSERT(evx->evx_status == DAOS_EVS_READY);
-	rc2 = daos_event_priv_reset();
+	rc2 = daos_event_priv_reset(false);
 	if (rc2) {
 		if (rc == 0)
 			rc = rc2;

diff --git a/src/include/daos/common.h b/src/include/daos/common.h
@@ -831,6 +831,8 @@ enum {
 #define DAOS_DTX_RESYNC_DELAY		(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4a)
 #define DAOS_DTX_FAIL_COMMIT		(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4b)
 
+#define DAOS_VC_SYNC_CORRUPTION		(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4c)
+
 #define DAOS_NVME_FAULTY		(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x50)
 #define DAOS_NVME_WRITE_ERR		(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x51)
 #define DAOS_NVME_READ_ERR		(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x52)

@@ -7672,6 +7672,12 @@ dc_obj_sync(tse_task_t *task)
 		D_GOTO(out_task, rc);
 	}
 
+	if (DAOS_FAIL_CHECK(DAOS_VC_SYNC_CORRUPTION)) {
+		/* It will trigger SIGFPE to simulate corruption. */
+		rc = args->epoch / *args->nr;
+		D_ASSERTF(rc < 0, "Unexpected result %d\n", rc);
+	}
+
 	obj_auxi->spec_shard = 0;
 	obj_auxi->spec_group = 0;
 

diff --git a/src/tests/suite/daos_base_tx.c b/src/tests/suite/daos_base_tx.c
@@ -704,6 +704,9 @@ dtx_resend_delay(test_arg_t *arg, daos_oclass_id_t oclass)
 	daos_fail_loc_set(0);
 	dtx_set_fail_loc(arg, 0);
 
+	/* Wait for the former delayed RPC before destroying the container to avoid DER_BUSY. */
+	sleep(2);
+
 	D_FREE(update_buf);
 	D_FREE(fetch_buf);
 	ioreq_fini(&req);
@@ -941,9 +944,9 @@ static const struct CMUnitTest dtx_tests[] = {
 	{"DTX19: DTX resend during bulk data transfer - multiple reps",
 	 dtx_19, NULL, test_case_teardown},
 	{"DTX20: race between DTX refresh and DTX resync",
-	 dtx_20, dtx_base_rf1_setup, test_case_teardown},
+	 dtx_20, dtx_base_rf1_setup, rebuild_sub_teardown},
 	{"DTX21: do not abort partially committed DTX",
-	 dtx_21, dtx_base_rf0_setup, test_case_teardown},
+	 dtx_21, dtx_base_rf0_setup, rebuild_sub_teardown},
 };
 
 static int

diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h
@@ -20,6 +20,7 @@
 #include <linux/limits.h>
 #include <sys/stat.h>
 #include <dirent.h>
+#include <signal.h>
 
 #include <cmocka.h>
 
@@ -66,7 +67,7 @@ extern char *test_io_dir;
 /* the IO conf file*/
 extern const char *test_io_conf;
 
-extern int daos_event_priv_reset(void);
+extern int daos_event_priv_reset(bool force);
 #define TEST_RANKS_MAX_NUM	(13)
 #define DAOS_SERVER_CONF	"/etc/daos/daos_server.yml"
 #define DAOS_SERVER_CONF_LENGTH		512
@@ -306,7 +307,35 @@ async_overlap(void **state)
 static inline int
 test_case_teardown(void **state)
 {
-	assert_rc_equal(daos_event_priv_reset(), 0);
+	char		*str = NULL;
+	sigset_t	 sigset;
+	bool		 force = false;
+
+	/*
+	 * If any of SIGFPE/SIGILL/SIGSEGV/SIGBUS/SIGSYS is in the signal mask, we presumably got
+	 * here via longjmp from cmocka's signal handling; force cleanup of the test environment.
+	 */
+	if (sigprocmask(0, NULL, &sigset) < 0) {
+		print_message("sigprocmask failure\n");
+	} else {
+		if (unlikely(sigismember(&sigset, SIGFPE)))
+			str = "SIGFPE";
+		else if (unlikely(sigismember(&sigset, SIGILL)))
+			str = "SIGILL";
+		else if (unlikely(sigismember(&sigset, SIGSEGV)))
+			str = "SIGSEGV";
+		else if (unlikely(sigismember(&sigset, SIGBUS)))
+			str = "SIGBUS";
+		else if (unlikely(sigismember(&sigset, SIGSYS)))
+			str = "SIGSYS";
+
+		if (str != NULL) {
+			print_message("Hit corruption (%s), cleanup by force\n", str);
+			force = true;
+		}
+	}
+
+	assert_rc_equal(daos_event_priv_reset(force), 0);
 	return 0;
 }
 

diff --git a/src/tests/suite/daos_verify_consistency.c b/src/tests/suite/daos_verify_consistency.c
@@ -334,8 +334,44 @@ vc_8(void **state)
 	ioreq_fini(&req);
 }
 
+static inline int
+vc_test_teardown(void **state)
+{
+	/* Clear the fail_loc here: a test that crashed may not have reset it itself. */
+	daos_fail_loc_set(0);
+	return test_case_teardown(state);
+}
+
 static void
 vc_9(void **state)
+{
+	test_arg_t	*arg = *state;
+	daos_obj_id_t	 oid;
+	struct ioreq	 req;
+
+	FAULT_INJECTION_REQUIRED();
+
+	print_message("sync corruption during verify\n");
+
+	if (!test_runable(arg, dts_vc_replica_cnt))
+		return;
+
+	oid = daos_test_oid_gen(arg->coh, dts_vc_class, 0, 0, arg->myrank);
+	ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg);
+
+	vc_gen_modifications(arg, &req, oid, 7, 7, 7, 0, 0, 0);
+
+	daos_fail_loc_set(DAOS_VC_SYNC_CORRUPTION | DAOS_FAIL_ONCE);
+
+	/* Do not care about consistency, just verify the cleanup logic after corruption. */
+	vc_obj_verify(arg, oid);
+
+	daos_fail_loc_set(0);
+	ioreq_fini(&req);
+}
+
+static void
+vc_10(void **state)
 {
 	test_arg_t	*arg = *state;
 	daos_obj_id_t	 oid;
@@ -378,8 +414,10 @@ static const struct CMUnitTest vc_tests[] = {
 	 vc_7, NULL, test_case_teardown},
 	{"VC8: verify with lost replica",
 	 vc_8, NULL, test_case_teardown},
-	{"VC9: verify with different dkey",
-	 vc_9, NULL, test_case_teardown},
+	{"VC9: sync corruption during verify",
+	 vc_9, NULL, vc_test_teardown},
+	{"VC10: verify with different dkey",
+	 vc_10, NULL, test_case_teardown},
 };
 
 static int