From 38c35c5d75906b2c30dadb15984a4370987f3adb Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 22 Feb 2025 07:52:30 -0500 Subject: [PATCH 1/4] DAOS-16620 vos: Exit scrub earlier during container destruction In sc_wait_until_should_continue(), the function could sleep for over 60 seconds before proceeding, causing container destruction timeouts and DER_BUSY errors during retries. This fix adds 1-second interval checks for sc_cont_is_stopping() during sleep cycles. Additionally, an error is now logged when container destruction is already in progress. Test-tag: test_soak_smoke pr Signed-off-by: Wang Shilong --- src/container/srv_target.c | 2 ++ src/vos/vos_pool_scrub.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 2718cbbc84a..071f6dabbc8 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1262,6 +1262,8 @@ cont_child_destroy_one(void *vin) } if (cont->sc_destroying) { + D_ERROR(DF_CONT ": Container is already being destroyed\n", + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid)); cont_child_put(tls->dt_cont_cache, cont); D_GOTO(out_pool, rc = -DER_BUSY); } diff --git a/src/vos/vos_pool_scrub.c b/src/vos/vos_pool_scrub.c index b158faafbc4..06b753aec55 100644 --- a/src/vos/vos_pool_scrub.c +++ b/src/vos/vos_pool_scrub.c @@ -272,6 +272,9 @@ sc_get_rec_in_chunk_at_idx(const struct scrub_ctx *ctx, uint32_t i) static void sc_wait_until_should_continue(struct scrub_ctx *ctx) { + if (sc_cont_is_stopping(ctx)) + return; + if (sc_mode(ctx) == DAOS_SCRUB_MODE_TIMED) { struct timespec now; uint64_t msec_between; @@ -279,6 +282,8 @@ sc_wait_until_should_continue(struct scrub_ctx *ctx) d_gettime(&now); while ((msec_between = sc_get_ms_between_scrubs(ctx)) > 0) { d_tm_set_gauge(ctx->sc_metrics.scm_next_csum_scrub, msec_between); + if (sc_cont_is_stopping(ctx)) + break; /* don't wait longer than 1 sec each loop */ sc_sleep(ctx, min(1000, msec_between)); 
} @@ -286,6 +291,8 @@ sc_wait_until_should_continue(struct scrub_ctx *ctx) sc_sleep(ctx, 0); while (!sc_is_idle(ctx) && sc_mode(ctx) == DAOS_SCRUB_MODE_LAZY) { sc_m_track_busy(ctx); + if (sc_cont_is_stopping(ctx)) + break; /* Don't actually know how long it will be but wait for 1 second before * trying again */ @@ -293,10 +300,17 @@ sc_wait_until_should_continue(struct scrub_ctx *ctx) } sc_m_track_idle(ctx); } else { + uint64_t sleep_seconds = 300; + D_ERROR("Unknown Scrub Mode: %d, Pool: " DF_UUID "\n", sc_mode(ctx), DP_UUID(ctx->sc_pool->sp_uuid)); /* sleep for 5 minutes to give pool property chance to resolve */ - sc_sleep(ctx, 1000 * 60 * 5); + while (sleep_seconds > 0) { + if (sc_cont_is_stopping(ctx)) + break; + sc_sleep(ctx, 1000); + sleep_seconds--; + } } } From 3258ac9e1b0cdaf2f90c9c3d7042ab4104a720ca Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 24 Feb 2025 21:03:06 -0500 Subject: [PATCH 2/4] address comments Signed-off-by: Wang Shilong --- src/vos/vos_pool_scrub.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/vos/vos_pool_scrub.c b/src/vos/vos_pool_scrub.c index 06b753aec55..2a1f0fa299f 100644 --- a/src/vos/vos_pool_scrub.c +++ b/src/vos/vos_pool_scrub.c @@ -300,17 +300,10 @@ sc_wait_until_should_continue(struct scrub_ctx *ctx) } sc_m_track_idle(ctx); } else { - uint64_t sleep_seconds = 300; - D_ERROR("Unknown Scrub Mode: %d, Pool: " DF_UUID "\n", sc_mode(ctx), DP_UUID(ctx->sc_pool->sp_uuid)); /* sleep for 5 minutes to give pool property chance to resolve */ - while (sleep_seconds > 0) { - if (sc_cont_is_stopping(ctx)) - break; - sc_sleep(ctx, 1000); - sleep_seconds--; - } + sc_sleep(ctx, 1000 * 60 * 5); } } From 97b3557a8c4424fa1511fa600a615bba2a3a0092 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 24 Feb 2025 21:08:54 -0500 Subject: [PATCH 3/4] test tag Test-tag: test_soak_smoke pr Signed-off-by: Wang Shilong From 9631f72957742d9e34aa95c1337f97edea9cb7ba Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: 
Thu, 27 Feb 2025 10:42:07 -0500 Subject: [PATCH 4/4] address comments Test-tag: test_soak_smoke pr Signed-off-by: Wang Shilong --- src/vos/vos_pool_scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vos/vos_pool_scrub.c b/src/vos/vos_pool_scrub.c index 2a1f0fa299f..490a3737682 100644 --- a/src/vos/vos_pool_scrub.c +++ b/src/vos/vos_pool_scrub.c @@ -282,21 +282,21 @@ sc_wait_until_should_continue(struct scrub_ctx *ctx) d_gettime(&now); while ((msec_between = sc_get_ms_between_scrubs(ctx)) > 0) { d_tm_set_gauge(ctx->sc_metrics.scm_next_csum_scrub, msec_between); - if (sc_cont_is_stopping(ctx)) - break; /* don't wait longer than 1 sec each loop */ sc_sleep(ctx, min(1000, msec_between)); + if (sc_cont_is_stopping(ctx)) + break; } } else if (sc_mode(ctx) == DAOS_SCRUB_MODE_LAZY) { sc_sleep(ctx, 0); while (!sc_is_idle(ctx) && sc_mode(ctx) == DAOS_SCRUB_MODE_LAZY) { sc_m_track_busy(ctx); - if (sc_cont_is_stopping(ctx)) - break; /* Don't actually know how long it will be but wait for 1 second before * trying again */ sc_sleep(ctx, 1000); + if (sc_cont_is_stopping(ctx)) + break; } sc_m_track_idle(ctx); } else {