Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-14725 client: force cleanup event query when test teardown #13509

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/client/api/client_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ daos_eqx2eq(struct daos_eq_private *eqx)
* next test case due to dirty ev_thpriv status.
*/
int
daos_event_priv_reset(void);
daos_event_priv_reset(bool force);

/**
* Retrieve the private per-thread event
Expand Down
47 changes: 23 additions & 24 deletions src/client/api/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -1057,13 +1057,8 @@ daos_event_init(struct daos_event *ev, daos_handle_t eqh,
return rc;
}

/**
* Unlink events from various list, parent_list, child list,
* and event queue hash list, and destroy all of the child
* events
**/
int
daos_event_fini(struct daos_event *ev)
static int
daos_event_fini_internal(struct daos_event *ev, bool force)
{
struct daos_event_private *evx = daos_ev2evx(ev);
struct daos_eq_private *eqx = NULL;
Expand All @@ -1080,7 +1075,7 @@ daos_event_fini(struct daos_event *ev)
D_MUTEX_LOCK(&eqx->eqx_lock);
}

if (evx->evx_status == DAOS_EVS_RUNNING) {
if (evx->evx_status == DAOS_EVS_RUNNING && !force) {
rc = -DER_BUSY;
goto out;
}
Expand All @@ -1095,14 +1090,9 @@ daos_event_fini(struct daos_event *ev)

tmp = d_list_entry(evx->evx_child.next,
struct daos_event_private, evx_link);
D_ASSERTF(tmp->evx_status == DAOS_EVS_READY ||
tmp->evx_status == DAOS_EVS_COMPLETED ||
tmp->evx_status == DAOS_EVS_ABORTED,
"EV %p status: %d\n", tmp, tmp->evx_status);

if (tmp->evx_status != DAOS_EVS_READY &&
tmp->evx_status != DAOS_EVS_COMPLETED &&
tmp->evx_status != DAOS_EVS_ABORTED) {
tmp->evx_status != DAOS_EVS_ABORTED && !force) {
D_ERROR("Child event %p launched: %d\n", daos_evx2ev(tmp), tmp->evx_status);
rc = -DER_INVAL;
goto out;
Expand All @@ -1111,17 +1101,14 @@ daos_event_fini(struct daos_event *ev)
if (eqx != NULL)
D_MUTEX_UNLOCK(&eqx->eqx_lock);

rc = daos_event_fini(daos_evx2ev(tmp));
rc = daos_event_fini_internal(daos_evx2ev(tmp), force);
if (rc < 0) {
D_ERROR("Failed to finalize child event "DF_RC"\n", DP_RC(rc));
goto out_unlocked;
}

if (eqx != NULL)
D_MUTEX_LOCK(&eqx->eqx_lock);

tmp->evx_status = DAOS_EVS_READY;
tmp->evx_parent = NULL;
}

/* If it is a child event, delete it from parent list */
Expand All @@ -1132,7 +1119,7 @@ daos_event_fini(struct daos_event *ev)
goto out;
}

if (evx->evx_parent->evx_status != DAOS_EVS_READY) {
if (evx->evx_parent->evx_status != DAOS_EVS_READY && !force) {
D_ERROR("Parent event not init or launched: %d\n",
evx->evx_parent->evx_status);
rc = -DER_INVAL;
Expand All @@ -1142,13 +1129,14 @@ daos_event_fini(struct daos_event *ev)
d_list_del_init(&evx->evx_link);
evx->evx_status = DAOS_EVS_READY;
evx->evx_parent = NULL;
evx->evx_ctx = NULL;
}

/* Remove from the evx_link */
if (!d_list_empty(&evx->evx_link)) {
d_list_del(&evx->evx_link);
D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING);

if (!force)
D_ASSERT(evx->evx_status != DAOS_EVS_RUNNING);

if (evx->evx_status == DAOS_EVS_COMPLETED && eq != NULL) {
D_ASSERTF(eq->eq_n_comp > 0, "eq %p\n", eq);
Expand All @@ -1166,6 +1154,17 @@ daos_event_fini(struct daos_event *ev)
return rc;
}

/**
* Unlink events from various list, parent_list, child list,
* and event queue hash list, and destroy all of the child
* events
**/
int
daos_event_fini(struct daos_event *ev)
{
return daos_event_fini_internal(ev, false);
}

struct daos_event *
daos_event_next(struct daos_event *parent,
struct daos_event *child)
Expand Down Expand Up @@ -1222,12 +1221,12 @@ daos_event_abort(struct daos_event *ev)
}

int
daos_event_priv_reset(void)
daos_event_priv_reset(bool force)
{
int rc;

if (ev_thpriv_is_init) {
rc = daos_event_fini(&ev_thpriv);
rc = daos_event_fini_internal(&ev_thpriv, force);
if (rc) {
D_ERROR("Failed to finalize thread private event "DF_RC"\n", DP_RC(rc));
return rc;
Expand Down Expand Up @@ -1310,7 +1309,7 @@ daos_event_priv_wait()
/** on success, the event should have been reset to ready stat by the progress cb */
if (rc == 0)
D_ASSERT(evx->evx_status == DAOS_EVS_READY);
rc2 = daos_event_priv_reset();
rc2 = daos_event_priv_reset(false);
if (rc2) {
if (rc == 0)
rc = rc2;
Expand Down
2 changes: 2 additions & 0 deletions src/include/daos/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,8 @@ enum {
#define DAOS_DTX_RESYNC_DELAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4a)
#define DAOS_DTX_FAIL_COMMIT (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4b)

#define DAOS_VC_SYNC_CORRUPTION (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4c)

#define DAOS_NVME_FAULTY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x50)
#define DAOS_NVME_WRITE_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x51)
#define DAOS_NVME_READ_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x52)
Expand Down
6 changes: 6 additions & 0 deletions src/object/cli_obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -7672,6 +7672,12 @@ dc_obj_sync(tse_task_t *task)
D_GOTO(out_task, rc);
}

if (DAOS_FAIL_CHECK(DAOS_VC_SYNC_CORRUPTION)) {
/* It will trigger SIGFPE to simulate corruption. */
rc = args->epoch / *args->nr;
D_ASSERTF(rc < 0, "Unexpected result %d\n", rc);
}

obj_auxi->spec_shard = 0;
obj_auxi->spec_group = 0;

Expand Down
7 changes: 5 additions & 2 deletions src/tests/suite/daos_base_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,9 @@ dtx_resend_delay(test_arg_t *arg, daos_oclass_id_t oclass)
daos_fail_loc_set(0);
dtx_set_fail_loc(arg, 0);

/* Wait for the former delayed RPC before destroying the container to avoid DER_BUSY. */
sleep(2);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you please explain why 2 sec sleep is needed here? why not 1 or 10?
does it vary if we change some CI hardware?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We injected some fail_loc to make related IO handler to delay process the IO request about 1 second later after client RPC timeout. Here, we sleep additional 1 second to guarantee that related IO handler has already responded.


D_FREE(update_buf);
D_FREE(fetch_buf);
ioreq_fini(&req);
Expand Down Expand Up @@ -941,9 +944,9 @@ static const struct CMUnitTest dtx_tests[] = {
{"DTX19: DTX resend during bulk data transfer - multiple reps",
dtx_19, NULL, test_case_teardown},
{"DTX20: race between DTX refresh and DTX resync",
dtx_20, dtx_base_rf1_setup, test_case_teardown},
dtx_20, dtx_base_rf1_setup, rebuild_sub_teardown},
{"DTX21: do not abort partially committed DTX",
dtx_21, dtx_base_rf0_setup, test_case_teardown},
dtx_21, dtx_base_rf0_setup, rebuild_sub_teardown},
};

static int
Expand Down
33 changes: 31 additions & 2 deletions src/tests/suite/daos_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <linux/limits.h>
#include <sys/stat.h>
#include <dirent.h>
#include <signal.h>

#include <cmocka.h>

Expand Down Expand Up @@ -66,7 +67,7 @@ extern char *test_io_dir;
/* the IO conf file*/
extern const char *test_io_conf;

extern int daos_event_priv_reset(void);
extern int daos_event_priv_reset(bool force);
#define TEST_RANKS_MAX_NUM (13)
#define DAOS_SERVER_CONF "/etc/daos/daos_server.yml"
#define DAOS_SERVER_CONF_LENGTH 512
Expand Down Expand Up @@ -306,7 +307,35 @@ async_overlap(void **state)
static inline int
test_case_teardown(void **state)
{
assert_rc_equal(daos_event_priv_reset(), 0);
char *str = NULL;
sigset_t sigset;
bool force = false;

/*
* If one of SIGFPE/SIGILL/SIGSEGV/SIGBUS/SIGSYS is in the signal mask, then the logic is
* longjump from cmocka for handling the signal, then need force cleanup test environment.
*/
if (sigprocmask(0, NULL, &sigset) < 0) {
print_message("sigprocmask failure\n");
} else {
Copy link
Contributor Author

@Nasf-Fan Nasf-Fan Jan 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@liuxuezhao @mchaarawi , sorry, I made some mistake in former comment. We cannot distinguish whether the test_teardown() is call from normal test complete routine or from cmocka signal handler longjmp. Because after signal handler longjmp, its signal mask has already been recovered, means that the signal that triggered the exception has been dropped when come here. So we cannot use the signal mask to distinguish the context unless before calling siglongjmp().

By registering my own signal handler, I can simulate the exception and trigger teardown() by force. That proves the teardown() by force can work (and subsequent test can go ahead without failure). But because we cannot distinguish the context without our own signal hander, then whether still keep the teardown() by force? What's your suggestion?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ping @liuxuezhao && @mchaarawi , what's your suggestion about above comment? Thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you mean in this PR it actually cannot distinguish whether or not should force daos_event_priv_reset()? if so not very sure what's the benefit to do so?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot distinguish whether the caller is from signal handler caused long jump or from regular test cleanup logic. So if consider @mchaarawi 's concern about potentially hiding bug when daos_event_priv_reset() by force, then we may have to drop the changes for daos_event_priv_reset() with "force" parameter. Then without such fixes, we have very few things to do for this ticket. Here is the simplified patch in another PR:
#13596.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the other PR looks much cleaner to me for this issue.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@liuxuezhao , are you fine with another PR? if yes, I will replace this one with it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am fine to go with the other PR first, thanks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Nasf-Fan "By registering my own signal handler, I can simulate the exception and trigger teardown() by force", if register signal handler in test_setup, can it get notification if got those abnormal signal? maybe it can be useful for daos_test to catch the error. just FYI.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean that by registering my own signal RPC handler, the daos_event_priv_reset() can distinguish whether the the caller is from signal handler or normal exit via some hack (but not suggested) flag, then I can verify the force cleanup logic works.

But because the corruption can happen during any test case, then we need to register related signal handlers to replace CMOCKA registered ones for almost all of our test cases when setup, that will cause a lot of code lines changes. Not sure whether it is worth or not.

On the other hand, which signal will be handled by CMOCKA is blind for DAOS, consider CMOCKA potential upgrade, then we need to trace CMOCKA changes (for signal handler) via new configuration. In theory, we can do that, but whether worth or not only for test cleanup.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, that seems quite complex and hack, maybe not worth to do that. thanks for the explanation.

if (unlikely(sigismember(&sigset, SIGFPE)))
str = "SIGFPE";
else if (unlikely(sigismember(&sigset, SIGILL)))
str = "SIGILL";
else if (unlikely(sigismember(&sigset, SIGSEGV)))
str = "SIGSEGV";
else if (unlikely(sigismember(&sigset, SIGBUS)))
str = "SIGBUS";
else if (unlikely(sigismember(&sigset, SIGSYS)))
str = "SIGSYS";

if (str != NULL) {
print_message("Hit corruption (%s), cleanup by force\n", str);
force = true;
}
}

assert_rc_equal(daos_event_priv_reset(force), 0);
return 0;
}

Expand Down
42 changes: 40 additions & 2 deletions src/tests/suite/daos_verify_consistency.c
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,44 @@ vc_8(void **state)
ioreq_fini(&req);
}

static inline int
vc_test_teardown(void **state)
{
daos_fail_loc_set(0);

return test_case_teardown(state);
}

static void
vc_9(void **state)
{
test_arg_t *arg = *state;
daos_obj_id_t oid;
struct ioreq req;

FAULT_INJECTION_REQUIRED();

print_message("sync corruption during verify\n");

if (!test_runable(arg, dts_vc_replica_cnt))
return;

oid = daos_test_oid_gen(arg->coh, dts_vc_class, 0, 0, arg->myrank);
ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg);

vc_gen_modifications(arg, &req, oid, 7, 7, 7, 0, 0, 0);

daos_fail_loc_set(DAOS_VC_SYNC_CORRUPTION | DAOS_FAIL_ONCE);

/* Do not care about consistency, just verify the cleanup logic after corruption. */
vc_obj_verify(arg, oid);

daos_fail_loc_set(0);
ioreq_fini(&req);
}

static void
vc_10(void **state)
{
test_arg_t *arg = *state;
daos_obj_id_t oid;
Expand Down Expand Up @@ -378,8 +414,10 @@ static const struct CMUnitTest vc_tests[] = {
vc_7, NULL, test_case_teardown},
{"VC8: verify with lost replica",
vc_8, NULL, test_case_teardown},
{"VC9: verify with different dkey",
vc_9, NULL, test_case_teardown},
{"VC9: sync corruption during verify",
vc_9, NULL, vc_test_teardown},
{"VC10: verify with different dkey",
vc_10, NULL, test_case_teardown},
};

static int
Expand Down