From b83f05c5f1fbefb8a5a867526285ff4034eeb54b Mon Sep 17 00:00:00 2001 From: wangdi Date: Tue, 16 Jan 2024 09:59:37 -0800 Subject: [PATCH 1/7] DAOS-14972 pool: Only allow UPIN engine as PS replicas (#13593) Only allow UPIN engine as PS replicas to avoid IV leader switch hassles during rebuild for the moment. Signed-off-by: Di Wang --- src/pool/srv_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index 0d9854fd436..c95f15cc715 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -20,7 +20,7 @@ #define POOL_GROUP_MAP_STATES (PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN) /* Map states of ranks that make up the pool service */ -#define POOL_SVC_MAP_STATES (PO_COMP_ST_UP | PO_COMP_ST_UPIN) +#define POOL_SVC_MAP_STATES (PO_COMP_ST_UPIN) /* * Since we want all PS replicas to belong to the pool group, From 0e08ed011e62112575cbec943d1ffd228b4cd976 Mon Sep 17 00:00:00 2001 From: Cedric Koch-Hofer <94527853+knard-intel@users.noreply.github.com> Date: Tue, 16 Jan 2024 19:37:16 +0100 Subject: [PATCH 2/7] DAOS-14896 gurt: Fix d_getenv with negative int (#13586) Fix regression of d_getenv_xxx() functions used to retrieve integer environment variables: support strings representing signed integers. 
Signed-off-by: Cedric Koch-Hofer --- src/gurt/misc.c | 90 ++++++++++++++------------- src/gurt/tests/test_gurt.c | 123 +++++++++++++++++++++++++++++-------- 2 files changed, 143 insertions(+), 70 deletions(-) diff --git a/src/gurt/misc.c b/src/gurt/misc.c index de0a1ae6fd7..ffb1a85bb0d 100644 --- a/src/gurt/misc.c +++ b/src/gurt/misc.c @@ -25,8 +25,6 @@ #include #include -#define UINT64_MAX_STR "18446744073709551615" - /* state buffer for DAOS rand and srand calls, NOT thread safe */ static struct drand48_data randBuffer = {0}; @@ -951,18 +949,17 @@ d_rank_range_list_free(d_rank_range_list_t *range_list) } static inline bool -dis_unsigned_str(char *str) +dis_signed_str(char *str) { - char *eos; - - if (str == NULL || str[0] == '\0') - return false; + char *eos; + size_t str_size; - eos = str + (sizeof(UINT64_MAX_STR) - 1); - while (str != eos && *str != '\0' && *str >= '0' && *str <= '9') + str_size = strlen(str); + eos = str + str_size; + while (str != eos && *str != '-' && (*str < '0' || *str > '9')) ++str; - return *str == '\0'; + return *str == '-'; } static inline bool @@ -1214,40 +1211,68 @@ d_getenv_char(const char *name, char *char_val) } static int -d_getenv_ull(unsigned long long *val, const char *name) +d_getenv_ull(unsigned long long *val, const char *name, size_t val_size) { char *env; + char *env_tmp = NULL; char *endptr; - unsigned long long tmp; + unsigned long long val_tmp; int rc; assert(val != NULL); assert(name != NULL); + assert(val_size <= sizeof(unsigned long long)); d_env_rwlock_rdlock(); env = getenv(name); if (env == NULL) { rc = -DER_NONEXIST; + d_env_rwlock_unlock(); goto out; } - if (!dis_unsigned_str(env)) { - rc = -DER_INVAL; + /* DAOS-14896 NOTES: + * - Duplicate env to reduce data race condition with external libraries not using the DAOS + * thread safe environment variables management API. + * - Use of strdup() as there is no limit to environment variable size. 
+ */ + env_tmp = strdup(env); + if (env_tmp == NULL) { + rc = -DER_NOMEM; + d_env_rwlock_unlock(); goto out; } + d_env_rwlock_unlock(); - errno = 0; - tmp = strtoull(env, &endptr, 0); - if (errno != 0 || endptr == env || *endptr != '\0') { + errno = 0; + val_tmp = strtoull(env_tmp, &endptr, 10); + if (errno != 0 || endptr == env_tmp || *endptr != '\0') { rc = -DER_INVAL; goto out; } - *val = tmp; + if (val_size != sizeof(unsigned long long)) { + const unsigned long long val_max = (1ull << val_size * 8) - 1; + const bool is_signed = dis_signed_str(env_tmp); + + if (is_signed) + val_tmp = ~val_tmp; + if (val_tmp > val_max || (is_signed && val_tmp >= val_max)) { + rc = -DER_INVAL; + goto out; + } + if (is_signed) { + val_tmp = ~val_tmp; + val_tmp <<= (sizeof(unsigned long long) - val_size) * 8; + val_tmp >>= (sizeof(unsigned long long) - val_size) * 8; + } + } + + *val = val_tmp; rc = -DER_SUCCESS; out: - d_env_rwlock_unlock(); + free(env_tmp); return rc; } @@ -1269,17 +1294,10 @@ d_getenv_uint(const char *name, unsigned *uint_val) assert(uint_val != NULL); assert(name != NULL); - rc = d_getenv_ull(&tmp, name); + rc = d_getenv_ull(&tmp, name, sizeof(unsigned)); if (rc != -DER_SUCCESS) return rc; -#if UINT_MAX != ULLONG_MAX - assert(sizeof(unsigned) < sizeof(unsigned long long)); - if (tmp > UINT_MAX) { - return -DER_INVAL; - } -#endif - *uint_val = (unsigned)tmp; return -DER_SUCCESS; } @@ -1301,17 +1319,10 @@ d_getenv_uint32_t(const char *name, uint32_t *uint32_val) assert(uint32_val != NULL); assert(name != NULL); - rc = d_getenv_ull(&tmp, name); + rc = d_getenv_ull(&tmp, name, sizeof(uint32_t)); if (rc != -DER_SUCCESS) return rc; -#if UINT32_MAX != ULLONG_MAX - assert(sizeof(uint32_t) < sizeof(unsigned long long)); - if (tmp > UINT32_MAX) { - return -DER_INVAL; - } -#endif - *uint32_val = (uint32_t)tmp; return -DER_SUCCESS; } @@ -1333,17 +1344,10 @@ d_getenv_uint64_t(const char *name, uint64_t *uint64_val) assert(uint64_val != NULL); assert(name != NULL); - rc = 
d_getenv_ull(&tmp, name); + rc = d_getenv_ull(&tmp, name, sizeof(uint64_t)); if (rc != -DER_SUCCESS) return rc; -#if UINT64_MAX != ULLONG_MAX - assert(sizeof(uint64_t) < sizeof(unsigned long long)); - if (tmp > UINT64_MAX) { - return -DER_INVAL; - } -#endif - *uint64_val = (uint64_t)tmp; return -DER_SUCCESS; } diff --git a/src/gurt/tests/test_gurt.c b/src/gurt/tests/test_gurt.c index 49db0a883e6..ebb9a0ec701 100644 --- a/src/gurt/tests/test_gurt.c +++ b/src/gurt/tests/test_gurt.c @@ -2288,35 +2288,63 @@ test_d_getenv_uint(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_true(val == UINT_MAX); - getenv_return = "42"; + getenv_return = "-1"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT_MAX); + + getenv_return = "-10"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT_MAX - 9); + + getenv_return = "-4294967294"; + rc = d_getenv_uint("foo", &val); + assert_true(val == 2); + + getenv_return = "-4294967295"; + rc = d_getenv_uint("foo", &val); + assert_true(val == 1); + + getenv_return = " 000042"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_SUCCESS); assert_true(val == 42); + getenv_return = " -000042"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == -42); + getenv_return = "4294967296"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); - getenv_return = "-42"; + getenv_return = "-4294967296"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "booo"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "42booo"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); + + getenv_return = ""; + rc = 
d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == -42); getenv_return = NULL; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_NONEXIST); - assert_true(val == 42); + assert_true(val == -42); } static void @@ -2330,40 +2358,63 @@ test_d_getenv_uint32_t(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_true(val == UINT32_MAX); - getenv_return = "42"; + getenv_return = "-1"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT32_MAX); + + getenv_return = "-10"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT32_MAX - 9); + + getenv_return = "-4294967294"; + rc = d_getenv_uint32_t("foo", &val); + assert_true(val == 2); + + getenv_return = "-4294967295"; + rc = d_getenv_uint32_t("foo", &val); + assert_true(val == 1); + + getenv_return = " 000042"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_SUCCESS); assert_true(val == 42); + getenv_return = " -000042"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == -42); + getenv_return = "4294967296"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); - getenv_return = "-42"; + getenv_return = "-4294967296"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "booo"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "42booo"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = ""; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = NULL; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, 
-DER_NONEXIST); - assert_true(val == 42); + assert_true(val == -42); } static void @@ -2377,45 +2428,63 @@ test_d_getenv_uint64_t(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_true(val == UINT64_MAX); - getenv_return = "42"; + getenv_return = "-1"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_SUCCESS); - assert_true(val == 42); + assert_true(val == UINT64_MAX); - getenv_return = "18446744073709551616"; + getenv_return = "-10"; rc = d_getenv_uint64_t("foo", &val); - assert_int_equal(rc, -DER_INVAL); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT64_MAX - 9); + + getenv_return = "-18446744073709551614"; + rc = d_getenv_uint64_t("foo", &val); + assert_true(val == 2); + + getenv_return = "-18446744073709551615"; + rc = d_getenv_uint64_t("foo", &val); + assert_true(val == 1); + + getenv_return = " 000042"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); assert_true(val == 42); - getenv_return = "012345678901234567890"; + getenv_return = " -000042"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == -42); + + getenv_return = "18446744073709551616"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); - getenv_return = "-42"; + getenv_return = "-18446744073709551616"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "booo"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "42booo"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = ""; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = NULL; rc = d_getenv_uint64_t("foo", &val); 
assert_int_equal(rc, -DER_NONEXIST); - assert_true(val == 42); + assert_true(val == -42); } static void From 8ca9d227bb69067187a21b434940c2fc873a5403 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Tue, 16 Jan 2024 10:47:30 -0800 Subject: [PATCH 3/7] DAOS-14594 test: fix dfs_parallel xml generation (#13312) Since cmocka is not MPI-aware, force all ranks other than rank 0 to write to stdout to avoid race conditions with the XML file. Signed-off-by: Dalton Bohning --- src/tests/suite/dfs_test.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/tests/suite/dfs_test.c b/src/tests/suite/dfs_test.c index f35e9df0f7b..217f30ad178 100644 --- a/src/tests/suite/dfs_test.c +++ b/src/tests/suite/dfs_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -81,16 +81,17 @@ run_specified_tests(const char *tests, int rank, int size, int main(int argc, char **argv) { - test_arg_t *arg; - char tests[64]; - char *exclude_str = NULL; - int ntests = 0; - int nr_failed = 0; - int nr_total_failed = 0; - int opt = 0, index = 0; - int rank; - int size; - int rc; + test_arg_t *arg; + char tests[64]; + char *exclude_str = NULL; + char *cmocka_message_output = NULL; + int ntests = 0; + int nr_failed = 0; + int nr_total_failed = 0; + int opt = 0, index = 0; + int rank; + int size; + int rc; d_register_alt_assert(mock_assert); @@ -166,6 +167,16 @@ main(int argc, char **argv) tests[new_idx] = '\0'; } + /** if writing XML, force all ranks other than rank 0 to use stdout to avoid conflicts */ + cmocka_message_output = getenv("CMOCKA_MESSAGE_OUTPUT"); + if (rank != 0 && cmocka_message_output && strcasecmp(cmocka_message_output, "xml") == 0) { + rc = d_setenv("CMOCKA_MESSAGE_OUTPUT", "stdout", 1); + if (rc) { + print_message("d_setenv() failed with %d\n", rc); + return -1; + } + } + nr_failed = run_specified_tests(tests, 
rank, size, NULL, 0); exit: From 8599cebf0d672a8e6dda4b566b416f25a14b2ffc Mon Sep 17 00:00:00 2001 From: wangdi Date: Wed, 17 Jan 2024 04:38:22 -0800 Subject: [PATCH 4/7] DAOS-14845 rebuild: do not wait for EC agg for reclaim (#13610) There is no need to wait for EC aggregation during the reclaim operation, which does not involve fetch and update. Signed-off-by: Di Wang --- src/rebuild/scan.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 4c2c78c4bee..352459a2d84 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -882,12 +882,13 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, } /* Wait for EC aggregation to finish. NB: migrate needs to wait for EC aggregation to finish */ - while (cont_child->sc_ec_agg_active) { + while (cont_child->sc_ec_agg_active && + rpt->rt_rebuild_op != RB_OP_RECLAIM && + rpt->rt_rebuild_op != RB_OP_FAIL_RECLAIM) { D_ASSERTF(rpt->rt_pool->sp_rebuilding >= 0, DF_UUID" rebuilding %d\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_pool->sp_rebuilding); /* Wait for EC aggregation to abort before discard the object */ - D_DEBUG(DB_REBUILD, DF_UUID" wait for ec agg abort.\n", - DP_UUID(entry->ie_couuid)); + D_INFO(DF_UUID" wait for ec agg abort.\n", DP_UUID(entry->ie_couuid)); dss_sleep(1000); if (rpt->rt_abort || rpt->rt_finishing) { D_DEBUG(DB_REBUILD, DF_CONT" rebuild op %s ver %u abort %u/%u.\n", From fd9d630462fff3f57be732bbf05dc1102d69a7d1 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 17 Jan 2024 07:42:26 -0500 Subject: [PATCH 5/7] DAOS-14969 test: Increase crt_timeout for test_daos_oid_allocator (#13599) * DAOS-14969 test: Increase crt_timeout for test_daos_oid_allocator Temporarily increase the crt_timeout for the test_daos_oid_allocator test to 60 seconds. 
Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/suite.yaml | 2 ++ src/tests/ftest/util/daos_core_base.py | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml index afcc048f965..5d08fb4493d 100644 --- a/src/tests/ftest/daos_test/suite.yaml +++ b/src/tests/ftest/daos_test/suite.yaml @@ -191,3 +191,5 @@ daos_tests: test_daos_extend_simple: 5 test_daos_rebuild_ec: 43 test_daos_degraded_ec: 29 + crt_timeout: + test_daos_oid_allocator: 60 diff --git a/src/tests/ftest/util/daos_core_base.py b/src/tests/ftest/util/daos_core_base.py index 9bf0ff4c501..1baa93b91b4 100644 --- a/src/tests/ftest/util/daos_core_base.py +++ b/src/tests/ftest/util/daos_core_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -96,6 +96,14 @@ def start_server_managers(self, force=False): ["=".join(items) for items in list(env_dict.items())] ) + # Update any other server settings unique to this test method + for setting in ["crt_timeout"]: + value = self.get_test_param(setting) + if value: + for server_mgr in self.server_managers: + for engine_params in server_mgr.manager.job.yaml.engine_params: + engine_params.set_value(setting, value) + # Start the servers return super().start_server_managers(force=force) From a0503a0d1267b014ded36ef8217ada87686346ef Mon Sep 17 00:00:00 2001 From: wangdi Date: Wed, 17 Jan 2024 09:11:12 -0800 Subject: [PATCH 6/7] DAOS-14965 tests: using correct rd_fac in online_rebuild_single (#13600) Use rd_fac:2 in online_rebuild_single Use svcn: 5 Signed-off-by: Di Wang --- src/tests/ftest/erasurecode/online_rebuild_single.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_single.yaml b/src/tests/ftest/erasurecode/online_rebuild_single.yaml index bda14dbb9e5..8b0b3f4baf0 100644 --- 
a/src/tests/ftest/erasurecode/online_rebuild_single.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_single.yaml @@ -30,9 +30,10 @@ server_config: storage: auto pool: size: 93% - svcn: 1 + svcn: 5 control_method: dmg pool_query_timeout: 30 + properties: rd_fac:2 container: type: POSIX control_method: API From da658ad7c603f768bc60d1206e55e0bced68bb8f Mon Sep 17 00:00:00 2001 From: "Brian J. Murrell" Date: Wed, 17 Jan 2024 16:36:52 -0500 Subject: [PATCH 7/7] DAOS-14440 build: Update distro versions in GHA (#13608) The supported distro versions got missed in 56e3228d36 for the GitHub Actions workflow. Signed-off-by: Brian J. Murrell --- .github/workflows/rpm-build-and-test.yml | 4 ++-- ci/functional/test_main.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rpm-build-and-test.yml b/.github/workflows/rpm-build-and-test.yml index bbad2b715b4..232b3cbd971 100644 --- a/.github/workflows/rpm-build-and-test.yml +++ b/.github/workflows/rpm-build-and-test.yml @@ -5,10 +5,10 @@ env: # build is done on the lowest version and test on the highest with a "sanity test" # stage done on all versions in the list ecept the highest EL8_BUILD_VERSION: 8.6 - EL8_VERSION: 8 + EL8_VERSION: 8.8 EL9_BUILD_VERSION: 9 EL9_VERSION: 9 - LEAP15_VERSION: 15.4 + LEAP15_VERSION: 15.5 on: workflow_dispatch: diff --git a/ci/functional/test_main.sh b/ci/functional/test_main.sh index d318b3601e3..56fe36f8571 100755 --- a/ci/functional/test_main.sh +++ b/ci/functional/test_main.sh @@ -45,7 +45,7 @@ test_cluster() { FIRST_NODE=${first_node} \ TEST_RPMS=${TEST_RPMS} \ NODELIST=${tnodes} \ - BUILD_URL=\"$BUILD_URL\" \ + BUILD_URL=\"${BUILD_URL:-Unknown in GHA}\" \ STAGE_NAME=\"$STAGE_NAME\" \ $(cat ci/functional/test_main_prep_node.sh)" }