diff --git a/src/include/daos_srv/evtree.h b/src/include/daos_srv/evtree.h index 63224259ccc..8ae4f12638e 100644 --- a/src/include/daos_srv/evtree.h +++ b/src/include/daos_srv/evtree.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -810,4 +811,16 @@ evt_feats_get(struct evt_root *root) */ int evt_feats_set(struct evt_root *root, struct umem_instance *umm, uint64_t feats); +/** Validate the provided evt. + * + * Note: It is designed for catastrophic recovery. Not to be performed at run-time. + * + * \param evt[in] evt descriptor to validate + * \param dtx_lid[in] local id of the DTX entry the evt is supposed to belong to + * + * \return true if evt is non-NULL, has the expected magic and belongs to dtx_lid. + **/ +bool +evt_desc_is_valid(const struct evt_desc *evt, uint32_t dtx_lid); + #endif /* __DAOS_EV_TREE_H__ */ diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 17f1753b100..a075bb0ecdd 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -1,5 +1,7 @@ """ (C) Copyright 2021-2024 Intel Corporation. 
+(C) Copyright 2025 Hewlett Packard Enterprise Development LP +(C) Copyright 2025 Google LLC SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -207,6 +209,8 @@ class TelemetryUtils(): _gen_stats_metrics("engine_io_dtx_committable") ENGINE_IO_DTX_COMMITTED_METRICS = \ _gen_stats_metrics("engine_io_dtx_committed") + ENGINE_IO_DTX_INVALID_METRICS = \ + _gen_stats_metrics("engine_io_dtx_invalid") ENGINE_IO_LATENCY_FETCH_METRICS = \ _gen_stats_metrics("engine_io_latency_fetch") ENGINE_IO_LATENCY_BULK_FETCH_METRICS = \ @@ -310,6 +314,7 @@ class TelemetryUtils(): ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\ ENGINE_IO_DTX_COMMITTABLE_METRICS +\ ENGINE_IO_DTX_COMMITTED_METRICS +\ + ENGINE_IO_DTX_INVALID_METRICS +\ ENGINE_IO_LATENCY_FETCH_METRICS +\ ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\ ENGINE_IO_LATENCY_VOS_FETCH_METRICS +\ diff --git a/src/vos/evt_priv.h b/src/vos/evt_priv.h index e855a9c74b2..bbf94ecd6cf 100644 --- a/src/vos/evt_priv.h +++ b/src/vos/evt_priv.h @@ -1,5 +1,7 @@ /** * (C) Copyright 2017-2022 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -115,7 +117,6 @@ struct evt_context { umem_off2ptr(evt_umm(tcx), offset) #define EVT_NODE_MAGIC 0xf00d -#define EVT_DESC_MAGIC 0xbeefdead /** Convert an offset to a evtree node descriptor * \param[IN] tcx Tree context diff --git a/src/vos/evtree.c b/src/vos/evtree.c index d635453f8b2..883e3820bc6 100644 --- a/src/vos/evtree.c +++ b/src/vos/evtree.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -4086,3 +4087,12 @@ evt_feats_set(struct evt_root *root, struct umem_instance *umm, uint64_t feats) return rc; } +bool +evt_desc_is_valid(const struct evt_desc *evt, uint32_t dtx_lid) +{ + if (evt == NULL || evt->dc_magic != EVT_DESC_MAGIC) { /* reject a NULL pointer or a descriptor whose magic is not EVT_DESC_MAGIC */ + return false; + } + + return (evt->dc_dtx == dtx_lid); /* valid only if the descriptor references the expected DTX local id */ +} diff --git a/src/vos/ilog.c b/src/vos/ilog.c index 1d1d6508087..86ce9c2afea 100644 --- a/src/vos/ilog.c +++ b/src/vos/ilog.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -390,16 +392,18 @@ ilog_create(struct umem_instance *umm, struct ilog_df *root) return rc; } -#define ILOG_ASSERT_VALID(root_df) \ - do { \ - struct ilog_root *_root; \ - \ - _root = (struct ilog_root *)(root_df); \ - D_ASSERTF((_root != NULL) && \ - ILOG_MAGIC_VALID(_root->lr_magic), \ - "Invalid ilog root detected %p magic=%#x\n", \ - _root, _root == NULL ? 0 : _root->lr_magic); \ - } while (0) +#define ILOG_CHECK_VALID(root_df) \ + ({ \ + struct ilog_root *_root = NULL; \ + D_ASSERT((root_df) != NULL); \ + _root = (struct ilog_root *)(root_df); \ + if (!ILOG_MAGIC_VALID(_root->lr_magic)) { \ + D_WARN("Invalid ilog root detected %p magic=%#x\n", _root, \ + _root == NULL ? 
0 : _root->lr_magic); \ + _root = NULL; \ + } \ + _root != NULL; \ + }) int ilog_open(struct umem_instance *umm, struct ilog_df *root, @@ -408,7 +412,8 @@ ilog_open(struct umem_instance *umm, struct ilog_df *root, struct ilog_context *lctx; int rc; - ILOG_ASSERT_VALID(root); + if (!ILOG_CHECK_VALID(root)) + return -DER_NONEXIST; rc = ilog_ctx_create(umm, (struct ilog_root *)root, cbs, &lctx); if (rc != 0) @@ -474,7 +479,7 @@ ilog_destroy(struct umem_instance *umm, int rc = 0; struct ilog_array_cache cache = {0}; - ILOG_ASSERT_VALID(root); + D_ASSERT(ILOG_CHECK_VALID(root)); rc = ilog_tx_begin(&lctx); if (rc != 0) { @@ -984,8 +989,12 @@ ilog_modify(daos_handle_t loh, const struct ilog_id *id_in, "%s in incarnation log " DF_X64 " status: rc=" DF_RC " tree_version: %d\n", opc_str[opc], id_in->id_epoch, DP_RC(rc), ilog_mag2ver(lctx->ic_root->lr_magic)); - if (rc == 0 && version != ilog_mag2ver(lctx->ic_root->lr_magic) && - (opc == ILOG_OP_PERSIST || opc == ILOG_OP_ABORT)) { + if (rc == 0 && opc != ILOG_OP_UPDATE) { + if (version == ilog_mag2ver(lctx->ic_root->lr_magic)) { + D_WARN("ilog entry on %s doesn't exist\n", opc_str[opc]); + return -DER_NONEXIST; + } + /** If we persisted or aborted an entry successfully, * invoke the callback, if applicable but without * deregistration @@ -1213,7 +1222,7 @@ ilog_fetch(struct umem_instance *umm, struct ilog_df *root_df, int rc = 0; bool retry; - ILOG_ASSERT_VALID(root_df); + D_ASSERT(ILOG_CHECK_VALID(root_df)); root = (struct ilog_root *)root_df; @@ -1539,7 +1548,7 @@ ilog_aggregate(struct umem_instance *umm, struct ilog_df *ilog, root = lctx->ic_root; - ILOG_ASSERT_VALID(root); + D_ASSERT(ILOG_CHECK_VALID(root)); D_ASSERT(!ilog_empty(root)); /* ilog_fetch should have failed */ diff --git a/src/vos/tests/vts_ilog.c b/src/vos/tests/vts_ilog.c index c696ff0b487..3982fc4c807 100644 --- a/src/vos/tests/vts_ilog.c +++ b/src/vos/tests/vts_ilog.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -530,6 +532,12 @@ ilog_test_update(void **state) rc = entries_check(umm, ilog, &ilog_callbacks, NULL, 0, entries); assert_rc_equal(rc, 0); + /* Test non-existent tx */ + id.id_epoch = epoch; + id.id_tx_id = current_tx_id.id_tx_id + 4000; + rc = ilog_persist(loh, &id); + assert_rc_equal(rc, -DER_NONEXIST); + /* Commit the punch ilog. */ id.id_epoch = epoch; id.id_tx_id = current_tx_id.id_tx_id; @@ -668,6 +676,12 @@ ilog_test_abort(void **state) rc = entries_check(umm, ilog, &ilog_callbacks, NULL, 0, entries); assert_rc_equal(rc, 0); + /* Test non-existent tx */ + id = current_tx_id; + id.id_tx_id += 400; + rc = ilog_abort(loh, &id); + assert_rc_equal(rc, -DER_NONEXIST); + id = current_tx_id; rc = ilog_abort(loh, &id); LOG_FAIL(rc, 0, "Failed to abort log entry\n"); @@ -735,6 +749,11 @@ ilog_test_abort(void **state) rc = ilog_destroy(umm, &ilog_callbacks, ilog); assert_rc_equal(rc, 0); + /** Test open of "reallocated" ilog */ + memset(ilog, 0xa1, sizeof(*ilog)); + rc = ilog_open(umm, ilog, &ilog_callbacks, false, &loh); + assert_rc_equal(rc, -DER_NONEXIST); + assert_true(d_list_empty(&fake_tx_list)); ilog_free_root(umm, ilog); } diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index 011f8a8ccd5..cedda5dc756 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -581,6 +581,12 @@ vos_tls_init(int tags, int xs_id, int tgt_id) D_WARN("Failed to create committed cnt sensor: "DF_RC"\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->vtl_invalid_dtx, D_TM_STATS_GAUGE, + "Number of invalid active DTX", "entries", + "io/dtx/invalid/tgt_%u", tgt_id); + if (rc) + D_WARN("Failed to create invalid DTX cnt sensor: " DF_RC "\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->vtl_obj_cnt, D_TM_GAUGE, "Number of cached vos object", "entry", "mem/vos/vos_obj_%u/tgt_%u", diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 
2cf6ad2ec45..d9475442a5f 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,6 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -573,7 +574,7 @@ dtx_ilog_rec_release(struct umem_instance *umm, struct vos_container *cont, ilog_close(loh); - if (rc != 0) + if (rc != 0 && rc != -DER_NONEXIST) D_ERROR("Failed to release ilog rec for "DF_DTI", abort %s: "DF_RC"\n", DP_DTI(&DAE_XID(dae)), abort ? "yes" : "no", DP_RC(rc)); @@ -598,6 +599,12 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, struct vos_irec_df *svt; svt = umem_off2ptr(umm, umem_off2offset(rec)); + + if (!vos_irec_is_valid(svt, DAE_LID(dae))) { + rc = -DER_NONEXIST; + break; + } + if (abort) { if (DAE_INDEX(dae) != DTX_INDEX_INVAL) { rc = umem_tx_add_ptr(umm, &svt->ir_dtx, @@ -621,6 +628,12 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, struct evt_desc *evt; evt = umem_off2ptr(umm, umem_off2offset(rec)); + + if (!evt_desc_is_valid(evt, DAE_LID(dae))) { + rc = -DER_NONEXIST; + break; + } + if (abort) { if (DAE_INDEX(dae) != DTX_INDEX_INVAL) { rc = umem_tx_add_ptr(umm, &evt->dc_dtx, @@ -648,6 +661,15 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, break; } + if (unlikely(rc == -DER_NONEXIST)) { + struct vos_tls *tls = vos_tls_get(false); + + D_WARN("DTX record no longer exists, may indicate some corruption: " + DF_DTI " type %u, discard\n", + DP_DTI(&DAE_XID(dae)), dtx_umoff_flag2type(rec)); + d_tm_inc_gauge(tls->vtl_invalid_dtx, 1); + } + return rc; } @@ -657,6 +679,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab struct umem_instance *umm = vos_cont2umm(cont); struct vos_dtx_act_ent_df *dae_df; struct vos_dtx_blob_df *dbd; + bool invalid = false; int count; int i; int rc = 0; @@ -685,42 +708,52 @@ dtx_rec_release(struct 
vos_container *cont, struct vos_dtx_act_ent *dae, bool ab abort ? "abort" : "commit", DP_DTI(&DAE_XID(dae)), dbd, DP_UUID(cont->vc_pool->vp_id), DP_UUID(cont->vc_id)); - if (dae->dae_records != NULL) { + /* Handle DTX records as FIFO order to find out potential invalid DTX earlier. */ + + if (DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT) + count = DTX_INLINE_REC_CNT; + else + count = DAE_REC_CNT(dae); + + for (i = 0; i < count; i++) { + rc = do_dtx_rec_release(umm, cont, dae, DAE_REC_INLINE(dae)[i], abort); + if (unlikely(rc == -DER_NONEXIST)) { + invalid = true; + break; + } + if (rc != 0) + return rc; + } + + if (!invalid && dae->dae_records != NULL) { D_ASSERT(DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT); D_ASSERT(!UMOFF_IS_NULL(dae_df->dae_rec_off)); - for (i = DAE_REC_CNT(dae) - DTX_INLINE_REC_CNT - 1; i >= 0; i--) { + for (i = 0; i < DAE_REC_CNT(dae) - DTX_INLINE_REC_CNT; i++) { rc = do_dtx_rec_release(umm, cont, dae, dae->dae_records[i], abort); + if (unlikely(rc == -DER_NONEXIST)) { + invalid = true; + break; + } if (rc != 0) return rc; } + } + if (!UMOFF_IS_NULL(dae_df->dae_rec_off)) { rc = umem_free(umm, dae_df->dae_rec_off); if (rc != 0) return rc; - if (keep_act) { + if (!invalid && keep_act) { rc = umem_tx_add_ptr(umm, &dae_df->dae_rec_off, sizeof(dae_df->dae_rec_off)); if (rc != 0) return rc; - dae_df->dae_rec_off = UMOFF_NULL; } - - count = DTX_INLINE_REC_CNT; - } else { - D_ASSERT(DAE_REC_CNT(dae) <= DTX_INLINE_REC_CNT); - - count = DAE_REC_CNT(dae); - } - - for (i = count - 1; i >= 0; i--) { - rc = do_dtx_rec_release(umm, cont, dae, DAE_REC_INLINE(dae)[i], abort); - if (rc != 0) - return rc; } - if (keep_act) { + if (!invalid && keep_act) { /* When re-commit partial committed DTX, the count can be zero. 
*/ if (dae_df->dae_rec_cnt > 0) { rc = umem_tx_add_ptr(umm, &dae_df->dae_rec_cnt, @@ -747,6 +780,9 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab return 0; } + if (invalid) + rc = 0; + if (!UMOFF_IS_NULL(dae_df->dae_mbs_off)) { /* dae_mbs_off will be invalid via flag DTE_INVALID. */ rc = umem_free(umm, dae_df->dae_mbs_off); diff --git a/src/vos/vos_ilog.c b/src/vos/vos_ilog.c index 54abf2f407f..19bf0102e6e 100644 --- a/src/vos/vos_ilog.c +++ b/src/vos/vos_ilog.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -427,6 +429,7 @@ int vos_ilog_update_(struct vos_container *cont, struct ilog_df *ilog, vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, dth == NULL, &loh); + D_ASSERTF(rc != -DER_NONEXIST, "Uncorrectable incarnation log corruption detected"); if (rc != 0) { D_ERROR("Could not open incarnation log: "DF_RC"\n", DP_RC(rc)); return rc; @@ -522,6 +525,7 @@ vos_ilog_punch_(struct vos_container *cont, struct ilog_df *ilog, punch_log: vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, dth == NULL, &loh); + D_ASSERTF(rc != -DER_NONEXIST, "Uncorrectable incarnation log corruption detected"); if (rc != 0) { D_ERROR("Could not open incarnation log: "DF_RC"\n", DP_RC(rc)); return rc; diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 1e0a219df0b..05a2ff7afd4 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -1,6 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -454,6 +455,8 @@ struct vos_dtx_cmt_ent { #define DCE_EPOCH(dce) ((dce)->dce_base.dce_epoch) #define DCE_CMT_TIME(dce) ((dce)->dce_base.dce_cmt_time) +#define EVT_DESC_MAGIC 0xbeefdead /* magic value compared against evt_desc::dc_magic by evt_desc_is_valid() */ + extern uint64_t vos_evt_feats; /** Flags for internal use - Bit 63 can be used for another purpose so as to @@ -1858,4 +1861,16 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v int vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); +/** Validate the provided svt. + * + * Note: It is designed for catastrophic recovery. Not to be performed at run-time. + * + * \param svt[in] record to validate + * \param dtx_lid[in] local id of the DTX entry the svt is supposed to belong to + * + * \return true if svt is non-NULL and belongs to dtx_lid. + **/ +bool +vos_irec_is_valid(const struct vos_irec_df *svt, uint32_t dtx_lid); + #endif /* __VOS_INTERNAL_H__ */ diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index b8123672aa4..35ff96872df 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -289,6 +291,7 @@ vos_oi_find_alloc(struct vos_container *cont, daos_unit_oid_t oid, goto skip_log; vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); rc = ilog_open(vos_cont2umm(cont), &obj->vo_ilog, &cbs, dth == NULL, &loh); + D_ASSERTF(rc != -DER_NONEXIST, "Uncorrectable incarnation log corruption detected"); if (rc != 0) return rc; diff --git a/src/vos/vos_tls.h b/src/vos/vos_tls.h index 2fc328457d0..11f45beef17 100644 --- a/src/vos/vos_tls.h +++ b/src/vos/vos_tls.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -63,6 +64,7 @@ struct vos_tls { bool vtl_hash_set; }; struct d_tm_node_t *vtl_committed; + struct d_tm_node_t *vtl_invalid_dtx; /* telemetry node for the "Number of invalid active DTX" gauge */ struct d_tm_node_t *vtl_obj_cnt; struct d_tm_node_t *vtl_lru_alloc_size; }; diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index c36fcaa88c5..4beeb7e766f 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1292,3 +1293,13 @@ obj_tree_find_attr(unsigned tree_class, int flags) return NULL; } } + +bool +vos_irec_is_valid(const struct vos_irec_df *svt, uint32_t dtx_lid) +{ + if (svt == NULL) { /* no record to check */ + return false; + } + + return svt->ir_dtx == dtx_lid; /* NOTE(review): only the DTX local id is compared; no magic is checked here, unlike evt_desc_is_valid() */ +}