Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport two changes that can improve rebuild stability #14452

Merged
merged 4 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/common/btree.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1994,27 +1994,32 @@ btr_update(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out
struct btr_record *rec;
int rc;
char sbuf[BTR_PRINT_BUF];
struct btr_trace *trace = &tcx->tc_trace[tcx->tc_depth - 1];

rec = btr_trace2rec(tcx, tcx->tc_depth - 1);

D_DEBUG(DB_TRACE, "Update record %s\n",
btr_rec_string(tcx, rec, true, sbuf, BTR_PRINT_BUF));

rc = btr_rec_update(tcx, rec, key, val, val_out);
if (rc == -DER_NO_PERM) { /* cannot make inplace change */
struct btr_trace *trace = &tcx->tc_trace[tcx->tc_depth - 1];

if (rc == -DER_NO_PERM) {
D_DEBUG(DB_TRACE, "Replace the original record\n");
if (btr_has_tx(tcx)) {
rc = btr_node_tx_add(tcx, trace->tr_node);
if (rc != 0)
goto out;
}

D_DEBUG(DB_TRACE, "Replace the original record\n");
rc = btr_rec_free(tcx, rec, NULL);
if (rc)
goto out;
rc = btr_rec_alloc(tcx, key, val, rec, val_out);
} else if (rc == 1) {
D_DEBUG(DB_TRACE, "Replace the record by btr_rec_update()\n");
if (btr_has_tx(tcx))
rc = btr_node_tx_add(tcx, trace->tr_node);
else
rc = 0;
}
out:
if (rc != 0) { /* failed */
Expand Down
6 changes: 1 addition & 5 deletions src/include/daos_srv/evtree.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,11 +376,7 @@ struct evt_entry_array {
/** Maximum size of array */
uint32_t ea_max;
/** Number of bytes per index */
uint32_t ea_inob;
/** Index of first delete record, valid if ea_delete_nr != 0 */
uint32_t ea_first_delete;
/** Number of delete records */
uint32_t ea_delete_nr;
uint32_t ea_inob;
/* Small array of embedded entries */
struct evt_list_entry ea_embedded_ents[0];
};
Expand Down
2 changes: 1 addition & 1 deletion src/include/daos_srv/vos_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <daos/dtx.h>
#include <daos/checksum.h>

#define VOS_SUB_OP_MAX ((uint16_t)-2)
#define VOS_SUB_OP_MAX (((uint16_t)-1) >> 2)

#define VOS_POOL_DF_2_2 24
#define VOS_POOL_DF_2_4 25
Expand Down
15 changes: 15 additions & 0 deletions src/vos/evt_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,21 @@
#include <daos_srv/evtree.h>
#include "vos_internal.h"

/** Upper two bits are reserved. On disk, this will be (uint16_t)-2)
* for backward compatibility reasons.
* For the upper bits,
* 00 means distributed transaction write
* 11 is reserved for max normal write and removal records
* 01 is reserved for rebuild writes
* 10 is reserved for future use
*
* When converting to not durable values, we want all rebuild records to be
* greater than any normal write records. So, we use a smaller value here.
*/
#define EVT_TX_MINOR_MAX_DF ((uint16_t)-2)
#define EVT_REBUILD_MINOR_MIN (VOS_SUB_OP_MAX + 1)
#define EVT_REBUILD_MINOR_MAX (EVT_REBUILD_MINOR_MIN + VOS_SUB_OP_MAX)

/**
* Tree node types.
* NB: a node can be both root and leaf.
Expand Down
134 changes: 70 additions & 64 deletions src/vos/evtree.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ static inline void
evt_rect_read(struct evt_rect *rout, const struct evt_rect_df *rin)
{
rout->rc_epc = rin->rd_epc;
rout->rc_minor_epc = rin->rd_minor_epc;
if (rin->rd_minor_epc != EVT_TX_MINOR_MAX_DF)
rout->rc_minor_epc = rin->rd_minor_epc;
else
rout->rc_minor_epc = VOS_SUB_OP_MAX;
evt_ext_read(&rout->rc_ex, rin);
};

Expand All @@ -52,8 +55,11 @@ evt_rect_write(struct evt_rect_df *rout, const struct evt_rect *rin)
{
evt_len_write(rout, evt_rect_width(rin));
rout->rd_epc = rin->rc_epc;
rout->rd_minor_epc = rin->rc_minor_epc;
rout->rd_lo = rin->rc_ex.ex_lo;
if (rin->rc_minor_epc != VOS_SUB_OP_MAX)
rout->rd_minor_epc = rin->rc_minor_epc;
else
rout->rd_minor_epc = EVT_TX_MINOR_MAX_DF;
};

#define DF_BUF_LEN 128
Expand Down Expand Up @@ -159,10 +165,20 @@ time_cmp(uint64_t t1, uint64_t t2, int *out)
return true;
}

if (t1 < t2)
if (t1 < t2) {
/** Even if t2 is rebuild, overwrite just works here */
*out = RT_OVERLAP_OVER;
else
} else {
*out = RT_OVERLAP_UNDER;
if (t2 == EVT_REBUILD_MINOR_MIN) {
/** If t1 is also a rebuild, we should return
* RT_OVERLAP_SAME to force adjustment of new rebuild
* minor epoch.
*/
if (t1 < EVT_REBUILD_MINOR_MAX)
*out = RT_OVERLAP_SAME;
}
}

return false;
}
Expand Down Expand Up @@ -2249,6 +2265,7 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,
struct evt_filter filter;
int rc;
int alt_rc = 0;
bool overwrite = false;

tcx = evt_hdl2tcx(toh);
if (tcx == NULL)
Expand Down Expand Up @@ -2279,6 +2296,13 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,

evt_ent_array_init(ent_array, 1);

/* For evt_remove_all, we only insert removals for things that
* are visible, so we shouldn't need to check for overwrite at
* all.
*/
if (entry->ei_rect.rc_minor_epc == EVT_MINOR_EPC_MAX)
goto run_tx;

filter.fr_ex = entry->ei_rect.rc_ex;
filter.fr_epr.epr_lo = entry->ei_rect.rc_epc;
filter.fr_epr.epr_hi = entry->ei_bound;
Expand All @@ -2293,61 +2317,41 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,
return rc;

if (ent_array->ea_ent_nr == 1) {
if (entry->ei_rect.rc_minor_epc == EVT_MINOR_EPC_MAX) {
/** Special case. This is an overlapping delete record
* which can happen when there are minor epochs
* involved. Rather than rejecting, insert prefix
* and/or suffix extents.
ent = evt_ent_array_get(ent_array, 0);
if (entry->ei_rect.rc_minor_epc == EVT_REBUILD_MINOR_MIN) {
ent_cpy = *entry;
ent_cpy.ei_rect.rc_minor_epc = ent->en_minor_epc + 1;
D_ASSERTF(ent_cpy.ei_rect.rc_minor_epc <= EVT_REBUILD_MINOR_MAX,
"minor_epc=%d\n", ent_cpy.ei_rect.rc_minor_epc);
/* This should never happen because otherwise, we would
* return no overlap.
*/
ent = evt_ent_array_get(ent_array, 0);
if (ent->en_ext.ex_lo <= entry->ei_rect.rc_ex.ex_lo &&
ent->en_ext.ex_hi >= entry->ei_rect.rc_ex.ex_hi) {
/** Nothing to do, existing extent contains
* the new one
*/
return 0;
}
D_ASSERTF(ent_cpy.ei_rect.rc_minor_epc > EVT_REBUILD_MINOR_MIN,
"minor_epc=%d\n", ent_cpy.ei_rect.rc_minor_epc);
entryp = &ent_cpy;
goto run_tx;
}
overwrite = true;
}

run_tx:
rc = evt_tx_begin(tcx);
if (rc != 0)
return rc;

if (tcx->tc_depth == 0) { /* empty tree */
rc = evt_root_activate(tcx, entry);
rc = evt_root_activate(tcx, entryp);
if (rc != 0)
goto out;
} else if (tcx->tc_inob == 0 && entry->ei_inob != 0) {
} else if (tcx->tc_inob == 0 && entryp->ei_inob != 0) {
rc = evt_root_tx_add(tcx);
if (rc != 0)
goto out;
tcx->tc_inob = tcx->tc_root->tr_inob = entry->ei_inob;
tcx->tc_inob = tcx->tc_root->tr_inob = entryp->ei_inob;
}

D_ASSERT(ent_array->ea_ent_nr <= 1);
if (ent_array->ea_ent_nr == 1) {
if (ent != NULL) {
memcpy(&ent_cpy, entry, sizeof(*entry));
entryp = &ent_cpy;
/** We need to edit the existing extent */
if (entry->ei_rect.rc_ex.ex_lo < ent->en_ext.ex_lo) {
ent_cpy.ei_rect.rc_ex.ex_hi = ent->en_ext.ex_lo - 1;
if (entry->ei_rect.rc_ex.ex_hi <= ent->en_ext.ex_hi)
goto insert;
/* There is also a suffix, so insert the prefix */
rc = evt_insert_entry(tcx, entryp, csum_bufp);
if (rc != 0)
goto out;
}

D_ASSERT(entry->ei_rect.rc_ex.ex_hi > ent->en_ext.ex_hi);
ent_cpy.ei_rect.rc_ex.ex_hi = entry->ei_rect.rc_ex.ex_hi;
ent_cpy.ei_rect.rc_ex.ex_lo = ent->en_ext.ex_hi + 1;

/* Now insert the suffix */
goto insert;
}
if (overwrite) {
/*
* NB: This is part of the current hack to keep "supporting"
* overwrite for same epoch, full overwrite.
Expand All @@ -2358,7 +2362,6 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,
goto out;
}

insert:
/* Phase-2: Inserting */
rc = evt_insert_entry(tcx, entryp, csum_bufp);

Expand Down Expand Up @@ -2584,7 +2587,7 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc,
int at;
int i;
int rc = 0;
bool has_agg = false;
bool has_agg = false;

V_TRACE(DB_TRACE, "Searching rectangle "DF_RECT" opc=%d\n",
DP_RECT(rect), find_opc);
Expand Down Expand Up @@ -2704,27 +2707,29 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc,
" overlaps with " DF_RECT "\n",
DP_RECT(&rtmp), DP_RECT(rect));

/* NB: This is temporary to allow full overwrite
* in same epoch to avoid breaking rebuild.
* Without some sequence number and client
* identifier, we can't do this robustly.
* There can be a race between rebuild and
* client doing different updates. But this
* isn't any worse than what we already have in
* place so I did it this way to minimize
* change while we decide how to handle this
* properly.
*/
if (range_overlap != RT_OVERLAP_SAME) {
D_ERROR("Same epoch partial "
"overwrite not supported:"
DF_RECT" overlaps with "DF_RECT
"\n", DP_RECT(rect),
DP_RECT(&rtmp));
rc = -DER_VOS_PARTIAL_UPDATE;
goto out;
if (rect->rc_minor_epc != EVT_REBUILD_MINOR_MIN) {
if (range_overlap != RT_OVERLAP_SAME) {
D_ERROR("Same epoch partial overwrite not "
"supported: " DF_RECT
" overlaps with " DF_RECT "\n",
DP_RECT(rect), DP_RECT(&rtmp));
rc = -DER_VOS_PARTIAL_UPDATE;
goto out;
}
}
break; /* we can update the record in place */

if (ent_array->ea_ent_nr == 0)
break;

/** If the extent is an overlap, partial or
* otherwise, we want to keep only the one with
* the highest minor epoch and let the caller
* deal with it.
*/
ent = evt_ent_array_get(ent_array, 0);
if (ent->en_minor_epc > rtmp.rc_minor_epc)
continue;
goto replace_ent;
case EVT_FIND_SAME:
if (range_overlap != RT_OVERLAP_SAME)
continue;
Expand Down Expand Up @@ -2762,6 +2767,7 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc,
goto out;
}

replace_ent:
evt_entry_fill(tcx, node, i, rect, intent, ent);
switch (find_opc) {
default:
Expand Down
6 changes: 4 additions & 2 deletions src/vos/tests/evt_ctl.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash

if [ "$USE_VALGRIND" = "memcheck" ]; then
VCMD="valgrind --leak-check=full --show-reachable=yes --error-limit=no \
if [ "$USE_VALGRIND" = "memcheck" ]; then VCMD="valgrind --leak-check=full --show-reachable=yes --error-limit=no \
--suppressions=${VALGRIND_SUPP} --error-exitcode=42 --xml=yes \
--xml-file=unit-test-evt_ctl-%p.memcheck.xml"
elif [ "$USE_VALGRIND" = "pmemcheck" ]; then
Expand Down Expand Up @@ -167,6 +166,9 @@ cmd+=" -C o:5 -a 0-1@1:ab -a 1-2@2:cd -a 3-4@3:bc -a 5-7@4:def -a 6-8@5:xyz"
cmd+=" -a 1-2@6:aa -a 4-7@7:abcd -b -2 -r 0-5@8 -b -2 -r 0-5@3 -b -2 -D"
cmd+=" -C o:4 -a [email protected]:abcdef -a [email protected]:fedcba -a [email protected]:foo -r 2-8@0-1"
cmd+=" -l0-10@0-10:c -r 7-9@0-1 -l0-10@0-10:C -b -2 -D"
cmd+=" -C o:15 -a [email protected]:abcdef -a [email protected]:ff -a [email protected]:abc -a [email protected]:ab"
cmd+=" -a [email protected]:abc -a [email protected]:vab -a [email protected]:ab -a [email protected]:a"
cmd+=" -b -2 -D"
echo "$cmd"
eval "$cmd"
result="${PIPESTATUS[0]}"
Expand Down
3 changes: 1 addition & 2 deletions src/vos/vos_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@
#include "vos_ilog.h"
#include "vos_obj.h"

#define VOS_MINOR_EPC_MAX (VOS_SUB_OP_MAX + 1)
D_CASSERT(VOS_MINOR_EPC_MAX == EVT_MINOR_EPC_MAX);
#define VOS_MINOR_EPC_MAX EVT_MINOR_EPC_MAX

#define VOS_TX_LOG_FAIL(rc, ...) \
do { \
Expand Down
26 changes: 13 additions & 13 deletions src/vos/vos_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,11 @@ struct vos_io_context {
/** the total size of the IO */
uint64_t ic_io_size;
/** flags */
unsigned int ic_update:1,
ic_size_fetch:1,
ic_save_recx:1,
ic_dedup:1, /** candidate for dedup */
ic_dedup_verify:1,
ic_read_ts_only:1,
ic_check_existence:1,
ic_remove:1,
ic_skip_fetch:1,
ic_agg_needed:1,
ic_ec:1; /**< see VOS_OF_EC */
unsigned int ic_update : 1, ic_size_fetch : 1, ic_save_recx : 1,
ic_dedup : 1, /** candidate for dedup */
ic_dedup_verify : 1, ic_read_ts_only : 1, ic_check_existence : 1, ic_remove : 1,
ic_skip_fetch : 1, ic_agg_needed : 1, ic_skip_akey_support : 1, ic_rebuild : 1,
ic_ec : 1; /**< see VOS_OF_EC */
/**
* Input shadow recx lists, one for each iod. Now only used for degraded
* mode EC obj fetch handling.
Expand Down Expand Up @@ -639,6 +633,7 @@ vos_ioc_create(daos_handle_t coh, daos_unit_oid_t oid, bool read_only,
ioc->ic_read_ts_only = 1;
ioc->ic_remove = ((vos_flags & VOS_OF_REMOVE) != 0);
ioc->ic_ec = ((vos_flags & VOS_OF_EC) != 0);
ioc->ic_rebuild = ((vos_flags & VOS_OF_REBUILD) != 0);
ioc->ic_umoffs_cnt = ioc->ic_umoffs_at = 0;
ioc->ic_iod_csums = iod_csums;
vos_ilog_fetch_init(&ioc->ic_dkey_info);
Expand Down Expand Up @@ -2296,6 +2291,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err,
struct vos_io_context *ioc = vos_ioh2ioc(ioh);
struct umem_instance *umem;
bool tx_started = false;
uint16_t minor_epc;

D_ASSERT(ioc->ic_update);
vos_dedup_verify_fini(ioh);
Expand Down Expand Up @@ -2340,9 +2336,13 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err,
if (err != 0)
goto abort;

if (dtx_is_valid_handle(dth))
minor_epc = dth->dth_op_seq;
else
minor_epc = ioc->ic_rebuild ? EVT_REBUILD_MINOR_MIN : VOS_SUB_OP_MAX;

/* Update tree index */
err = dkey_update(ioc, pm_ver, dkey, dtx_is_valid_handle(dth) ?
dth->dth_op_seq : VOS_SUB_OP_MAX);
err = dkey_update(ioc, pm_ver, dkey, minor_epc);
if (err) {
VOS_TX_LOG_FAIL(err, "Failed to update tree index: "DF_RC"\n",
DP_RC(err));
Expand Down
Loading
Loading