diff --git a/src/common/btree.c b/src/common/btree.c index 9743f23deb4..42e3f05b879 100644 --- a/src/common/btree.c +++ b/src/common/btree.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1994,6 +1994,7 @@ btr_update(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out struct btr_record *rec; int rc; char sbuf[BTR_PRINT_BUF]; + struct btr_trace *trace = &tcx->tc_trace[tcx->tc_depth - 1]; rec = btr_trace2rec(tcx, tcx->tc_depth - 1); @@ -2001,20 +2002,24 @@ btr_update(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out btr_rec_string(tcx, rec, true, sbuf, BTR_PRINT_BUF)); rc = btr_rec_update(tcx, rec, key, val, val_out); - if (rc == -DER_NO_PERM) { /* cannot make inplace change */ - struct btr_trace *trace = &tcx->tc_trace[tcx->tc_depth - 1]; - + if (rc == -DER_NO_PERM) { + D_DEBUG(DB_TRACE, "Replace the original record\n"); if (btr_has_tx(tcx)) { rc = btr_node_tx_add(tcx, trace->tr_node); if (rc != 0) goto out; } - D_DEBUG(DB_TRACE, "Replace the original record\n"); rc = btr_rec_free(tcx, rec, NULL); if (rc) goto out; rc = btr_rec_alloc(tcx, key, val, rec, val_out); + } else if (rc == 1) { + D_DEBUG(DB_TRACE, "Replace the record by btr_rec_update()\n"); + if (btr_has_tx(tcx)) + rc = btr_node_tx_add(tcx, trace->tr_node); + else + rc = 0; } out: if (rc != 0) { /* failed */ diff --git a/src/include/daos_srv/evtree.h b/src/include/daos_srv/evtree.h index 4db46f38d90..63224259ccc 100644 --- a/src/include/daos_srv/evtree.h +++ b/src/include/daos_srv/evtree.h @@ -376,11 +376,7 @@ struct evt_entry_array { /** Maximum size of array */ uint32_t ea_max; /** Number of bytes per index */ - uint32_t ea_inob; - /** Index of first delete record, valid if ea_delete_nr != 0 */ - uint32_t ea_first_delete; - /** Number of delete records */ - uint32_t ea_delete_nr; + uint32_t ea_inob; /* Small array of embedded entries */ struct evt_list_entry ea_embedded_ents[0]; }; diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index 5bccb2d2ef2..15c30c158c9 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -15,7 +15,7 @@ #include #include -#define VOS_SUB_OP_MAX ((uint16_t)-2) +#define VOS_SUB_OP_MAX (((uint16_t)-1) >> 2) #define VOS_POOL_DF_2_2 24 #define VOS_POOL_DF_2_4 25 diff --git a/src/vos/evt_priv.h b/src/vos/evt_priv.h index a0eb8e42e10..e855a9c74b2 100644 --- a/src/vos/evt_priv.h +++ b/src/vos/evt_priv.h @@ -9,6 +9,21 @@ #include #include "vos_internal.h" +/** Upper two bits are reserved. On disk, this will be (uint16_t)-2) + * for backward compatibility reasons. + * For the upper bits, + * 00 means distributed transaction write + * 11 is reserved for max normal write and removal records + * 01 is reserved for rebuild writes + * 10 is reserved for future use + * + * When converting to not durable values, we want all rebuild records to be + * greater than any normal write records. So, we use a smaller value here. + */ +#define EVT_TX_MINOR_MAX_DF ((uint16_t)-2) +#define EVT_REBUILD_MINOR_MIN (VOS_SUB_OP_MAX + 1) +#define EVT_REBUILD_MINOR_MAX (EVT_REBUILD_MINOR_MIN + VOS_SUB_OP_MAX) + /** * Tree node types. * NB: a node can be both root and leaf. diff --git a/src/vos/evtree.c b/src/vos/evtree.c index d4adb67cfde..9f5b3e01f1d 100644 --- a/src/vos/evtree.c +++ b/src/vos/evtree.c @@ -42,7 +42,10 @@ static inline void evt_rect_read(struct evt_rect *rout, const struct evt_rect_df *rin) { rout->rc_epc = rin->rd_epc; - rout->rc_minor_epc = rin->rd_minor_epc; + if (rin->rd_minor_epc != EVT_TX_MINOR_MAX_DF) + rout->rc_minor_epc = rin->rd_minor_epc; + else + rout->rc_minor_epc = VOS_SUB_OP_MAX; evt_ext_read(&rout->rc_ex, rin); }; @@ -52,8 +55,11 @@ evt_rect_write(struct evt_rect_df *rout, const struct evt_rect *rin) { evt_len_write(rout, evt_rect_width(rin)); rout->rd_epc = rin->rc_epc; - rout->rd_minor_epc = rin->rc_minor_epc; rout->rd_lo = rin->rc_ex.ex_lo; + if (rin->rc_minor_epc != VOS_SUB_OP_MAX) + rout->rd_minor_epc = rin->rc_minor_epc; + else + rout->rd_minor_epc = EVT_TX_MINOR_MAX_DF; }; #define DF_BUF_LEN 128 @@ -159,10 +165,20 @@ time_cmp(uint64_t t1, uint64_t t2, int *out) return true; } - if (t1 < t2) + if (t1 < t2) { + /** Even if t2 is rebuild, overwrite just works here */ *out = RT_OVERLAP_OVER; - else + } else { *out = RT_OVERLAP_UNDER; + if (t2 == EVT_REBUILD_MINOR_MIN) { + /** If t1 is also a rebuild, we should return + * RT_OVERLAP_SAME to force adjustment of new rebuild + * minor epoch. + */ + if (t1 < EVT_REBUILD_MINOR_MAX) + *out = RT_OVERLAP_SAME; + } + } return false; } @@ -2249,6 +2265,7 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry, struct evt_filter filter; int rc; int alt_rc = 0; + bool overwrite = false; tcx = evt_hdl2tcx(toh); if (tcx == NULL) @@ -2279,6 +2296,13 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry, evt_ent_array_init(ent_array, 1); + /* For evt_remove_all, we only insert removals for things that + * are visible, so we shouldn't need to check for overwrite at + * all. + */ + if (entry->ei_rect.rc_minor_epc == EVT_MINOR_EPC_MAX) + goto run_tx; + filter.fr_ex = entry->ei_rect.rc_ex; filter.fr_epr.epr_lo = entry->ei_rect.rc_epc; filter.fr_epr.epr_hi = entry->ei_bound; @@ -2293,61 +2317,41 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry, return rc; if (ent_array->ea_ent_nr == 1) { - if (entry->ei_rect.rc_minor_epc == EVT_MINOR_EPC_MAX) { - /** Special case. This is an overlapping delete record - * which can happen when there are minor epochs - * involved. Rather than rejecting, insert prefix - * and/or suffix extents. + ent = evt_ent_array_get(ent_array, 0); + if (entry->ei_rect.rc_minor_epc == EVT_REBUILD_MINOR_MIN) { + ent_cpy = *entry; + ent_cpy.ei_rect.rc_minor_epc = ent->en_minor_epc + 1; + D_ASSERTF(ent_cpy.ei_rect.rc_minor_epc <= EVT_REBUILD_MINOR_MAX, + "minor_epc=%d\n", ent_cpy.ei_rect.rc_minor_epc); + /* This should never happen because otherwise, we would + * return no overlap. */ - ent = evt_ent_array_get(ent_array, 0); - if (ent->en_ext.ex_lo <= entry->ei_rect.rc_ex.ex_lo && - ent->en_ext.ex_hi >= entry->ei_rect.rc_ex.ex_hi) { - /** Nothing to do, existing extent contains - * the new one - */ - return 0; - } + D_ASSERTF(ent_cpy.ei_rect.rc_minor_epc > EVT_REBUILD_MINOR_MIN, + "minor_epc=%d\n", ent_cpy.ei_rect.rc_minor_epc); + entryp = &ent_cpy; + goto run_tx; } + overwrite = true; } +run_tx: rc = evt_tx_begin(tcx); if (rc != 0) return rc; if (tcx->tc_depth == 0) { /* empty tree */ - rc = evt_root_activate(tcx, entry); + rc = evt_root_activate(tcx, entryp); if (rc != 0) goto out; - } else if (tcx->tc_inob == 0 && entry->ei_inob != 0) { + } else if (tcx->tc_inob == 0 && entryp->ei_inob != 0) { rc = evt_root_tx_add(tcx); if (rc != 0) goto out; - tcx->tc_inob = tcx->tc_root->tr_inob = entry->ei_inob; + tcx->tc_inob = tcx->tc_root->tr_inob = entryp->ei_inob; } D_ASSERT(ent_array->ea_ent_nr <= 1); - if (ent_array->ea_ent_nr == 1) { - if (ent != NULL) { - memcpy(&ent_cpy, entry, sizeof(*entry)); - entryp = &ent_cpy; - /** We need to edit the existing extent */ - if (entry->ei_rect.rc_ex.ex_lo < ent->en_ext.ex_lo) { - ent_cpy.ei_rect.rc_ex.ex_hi = ent->en_ext.ex_lo - 1; - if (entry->ei_rect.rc_ex.ex_hi <= ent->en_ext.ex_hi) - goto insert; - /* There is also a suffix, so insert the prefix */ - rc = evt_insert_entry(tcx, entryp, csum_bufp); - if (rc != 0) - goto out; - } - - D_ASSERT(entry->ei_rect.rc_ex.ex_hi > ent->en_ext.ex_hi); - ent_cpy.ei_rect.rc_ex.ex_hi = entry->ei_rect.rc_ex.ex_hi; - ent_cpy.ei_rect.rc_ex.ex_lo = ent->en_ext.ex_hi + 1; - - /* Now insert the suffix */ - goto insert; - } + if (overwrite) { /* * NB: This is part of the current hack to keep "supporting" * overwrite for same epoch, full overwrite. @@ -2358,7 +2362,6 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry, goto out; } -insert: /* Phase-2: Inserting */ rc = evt_insert_entry(tcx, entryp, csum_bufp); @@ -2584,7 +2587,7 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc, int at; int i; int rc = 0; - bool has_agg = false; + bool has_agg = false; V_TRACE(DB_TRACE, "Searching rectangle "DF_RECT" opc=%d\n", DP_RECT(rect), find_opc); @@ -2704,27 +2707,29 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc, " overlaps with " DF_RECT "\n", DP_RECT(&rtmp), DP_RECT(rect)); - /* NB: This is temporary to allow full overwrite - * in same epoch to avoid breaking rebuild. - * Without some sequence number and client - * identifier, we can't do this robustly. - * There can be a race between rebuild and - * client doing different updates. But this - * isn't any worse than what we already have in - * place so I did it this way to minimize - * change while we decide how to handle this - * properly. - */ - if (range_overlap != RT_OVERLAP_SAME) { - D_ERROR("Same epoch partial " - "overwrite not supported:" - DF_RECT" overlaps with "DF_RECT - "\n", DP_RECT(rect), - DP_RECT(&rtmp)); - rc = -DER_VOS_PARTIAL_UPDATE; - goto out; + if (rect->rc_minor_epc != EVT_REBUILD_MINOR_MIN) { + if (range_overlap != RT_OVERLAP_SAME) { + D_ERROR("Same epoch partial overwrite not " + "supported: " DF_RECT + " overlaps with " DF_RECT "\n", + DP_RECT(rect), DP_RECT(&rtmp)); + rc = -DER_VOS_PARTIAL_UPDATE; + goto out; + } } - break; /* we can update the record in place */ + + if (ent_array->ea_ent_nr == 0) + break; + + /** If the extent is an overlap, partial or + * otherwise, we want to keep only the one with + * the highest minor epoch and let the caller + * deal with it. + */ + ent = evt_ent_array_get(ent_array, 0); + if (ent->en_minor_epc > rtmp.rc_minor_epc) + continue; + goto replace_ent; case EVT_FIND_SAME: if (range_overlap != RT_OVERLAP_SAME) continue; @@ -2762,6 +2767,7 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc, goto out; } +replace_ent: evt_entry_fill(tcx, node, i, rect, intent, ent); switch (find_opc) { default: diff --git a/src/vos/tests/evt_ctl.sh b/src/vos/tests/evt_ctl.sh index 8f60dd61d48..4191da8b585 100755 --- a/src/vos/tests/evt_ctl.sh +++ b/src/vos/tests/evt_ctl.sh @@ -1,7 +1,6 @@ #!/bin/bash -if [ "$USE_VALGRIND" = "memcheck" ]; then - VCMD="valgrind --leak-check=full --show-reachable=yes --error-limit=no \ +if [ "$USE_VALGRIND" = "memcheck" ]; then VCMD="valgrind --leak-check=full --show-reachable=yes --error-limit=no \ --suppressions=${VALGRIND_SUPP} --error-exitcode=42 --xml=yes \ --xml-file=unit-test-evt_ctl-%p.memcheck.xml" elif [ "$USE_VALGRIND" = "pmemcheck" ]; then @@ -167,6 +166,9 @@ cmd+=" -C o:5 -a 0-1@1:ab -a 1-2@2:cd -a 3-4@3:bc -a 5-7@4:def -a 6-8@5:xyz" cmd+=" -a 1-2@6:aa -a 4-7@7:abcd -b -2 -r 0-5@8 -b -2 -r 0-5@3 -b -2 -D" cmd+=" -C o:4 -a 0-5@1.1:abcdef -a 5-10@1.2:fedcba -a 5-7@1.3:foo -r 2-8@0-1" cmd+=" -l0-10@0-10:c -r 7-9@0-1 -l0-10@0-10:C -b -2 -D" +cmd+=" -C o:15 -a 0-5@1.1:abcdef -a 1-2@1.2:ff -a 3-5@1.14:abc -a -3-4@1.14:ab" +cmd+=" -a 3-5@1.14:abc -a 2-4@1.16384:vab -a 3-4@1.16384:ab -a 3-3@1.16384:a" +cmd+=" -b -2 -D" echo "$cmd" eval "$cmd" result="${PIPESTATUS[0]}" diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 72b1ef57acd..92e69cc8ad5 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -27,8 +27,7 @@ #include "vos_ilog.h" #include "vos_obj.h" -#define VOS_MINOR_EPC_MAX (VOS_SUB_OP_MAX + 1) -D_CASSERT(VOS_MINOR_EPC_MAX == EVT_MINOR_EPC_MAX); +#define VOS_MINOR_EPC_MAX EVT_MINOR_EPC_MAX #define VOS_TX_LOG_FAIL(rc, ...) \ do { \ diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index ce332e875b6..a27af4be11a 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -67,17 +67,11 @@ struct vos_io_context { /** the total size of the IO */ uint64_t ic_io_size; /** flags */ - unsigned int ic_update:1, - ic_size_fetch:1, - ic_save_recx:1, - ic_dedup:1, /** candidate for dedup */ - ic_dedup_verify:1, - ic_read_ts_only:1, - ic_check_existence:1, - ic_remove:1, - ic_skip_fetch:1, - ic_agg_needed:1, - ic_ec:1; /**< see VOS_OF_EC */ + unsigned int ic_update : 1, ic_size_fetch : 1, ic_save_recx : 1, + ic_dedup : 1, /** candidate for dedup */ + ic_dedup_verify : 1, ic_read_ts_only : 1, ic_check_existence : 1, ic_remove : 1, + ic_skip_fetch : 1, ic_agg_needed : 1, ic_skip_akey_support : 1, ic_rebuild : 1, + ic_ec : 1; /**< see VOS_OF_EC */ /** * Input shadow recx lists, one for each iod. Now only used for degraded * mode EC obj fetch handling. @@ -639,6 +633,7 @@ vos_ioc_create(daos_handle_t coh, daos_unit_oid_t oid, bool read_only, ioc->ic_read_ts_only = 1; ioc->ic_remove = ((vos_flags & VOS_OF_REMOVE) != 0); ioc->ic_ec = ((vos_flags & VOS_OF_EC) != 0); + ioc->ic_rebuild = ((vos_flags & VOS_OF_REBUILD) != 0); ioc->ic_umoffs_cnt = ioc->ic_umoffs_at = 0; ioc->ic_iod_csums = iod_csums; vos_ilog_fetch_init(&ioc->ic_dkey_info); @@ -2296,6 +2291,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, struct vos_io_context *ioc = vos_ioh2ioc(ioh); struct umem_instance *umem; bool tx_started = false; + uint16_t minor_epc; D_ASSERT(ioc->ic_update); vos_dedup_verify_fini(ioh); @@ -2340,9 +2336,13 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; + if (dtx_is_valid_handle(dth)) + minor_epc = dth->dth_op_seq; + else + minor_epc = ioc->ic_rebuild ? EVT_REBUILD_MINOR_MIN : VOS_SUB_OP_MAX; + /* Update tree index */ - err = dkey_update(ioc, pm_ver, dkey, dtx_is_valid_handle(dth) ? - dth->dth_op_seq : VOS_SUB_OP_MAX); + err = dkey_update(ioc, pm_ver, dkey, minor_epc); if (err) { VOS_TX_LOG_FAIL(err, "Failed to update tree index: "DF_RC"\n", DP_RC(err)); diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index bd2d1c45555..228df1e5069 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -674,7 +674,12 @@ svt_rec_update(struct btr_instance *tins, struct btr_record *rec, if (rc != 0) return rc; - return svt_rec_alloc_common(tins, rec, skey, rbund); + rc = svt_rec_alloc_common(tins, rec, skey, rbund); + if (rc != 0) + return rc; + + /* Inform btr_update() that the original record is replaced */ + return 1; } static int