Skip to content

Commit

Permalink
Fix subtle bug in state catchup protocol
Browse files Browse the repository at this point in the history
Before, nodes from the current membership were monitored for the need
for state transmission, and the latest stable checkpoint was being sent
to them. This sometimes resulted in sending checkpoints to nodes that
were not yet in the transmitted checkpoint's membership, causing them
to get stuck.

This has been fixed by making sure we only send a checkpoint to a node
if the node is in that checkpoint's membership, and by the receiving
nodes ignoring checkpoints if they are not included in the corresponding
membership (such checkpoints can always be sent by faulty nodes).

Signed-off-by: Matej Pavlovic <[email protected]>
  • Loading branch information
matejpavlovic committed Apr 17, 2023
1 parent bf95ef0 commit c716023
Showing 1 changed file with 23 additions and 5 deletions.
28 changes: 23 additions & 5 deletions pkg/iss/iss.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,20 +368,29 @@ func New(
})

isspbdsl.UponPushCheckpoint(iss.m, func() error {

// Send the latest stable checkpoint to potentially
// delayed nodes. The set of nodes to send the latest
// stable checkpoint to is determined based on the
// highest epoch number in the messages received from
// highest epoch number in the checkpoint messages received from
// the node so far. If the highest epoch number we
// have heard from the node is more than params.RetainedEpochs
// behind, then that node is likely to be left behind
// and needs the stable checkpoint in order to start
// catching up with state transfer.

// Convenience variables
epoch := iss.lastStableCheckpoint.Epoch()
membership := iss.lastStableCheckpoint.Memberships()[0] // Membership of the epoch started by th checkpoint

// We loop through the checkpoint's first epoch's membership
// and not through the membership this replica is currently in.
// (Which might differ if the replica already advanced to a new epoch,
// but hasn't obtained its starting checkpoint yet.)
// This is required to avoid sending old checkpoints to replicas
// that are not yet part of the system for those checkpoints.
var delayed []t.NodeID
lastStableCheckpointEpoch := iss.lastStableCheckpoint.Epoch()
for n := range iss.epoch.Membership {
if lastStableCheckpointEpoch > iss.nodeEpochMap[n]+t.EpochNr(iss.Params.RetainedEpochs) {
for n := range membership {
if epoch > iss.nodeEpochMap[n]+t.EpochNr(iss.Params.RetainedEpochs) {
delayed = append(delayed, n)
}
}
Expand Down Expand Up @@ -417,6 +426,15 @@ func New(
return nil
}

// Ignore checkpoint if we are not part of its membership
// (more precisely, membership of the epoch the checkpoint is at the start of).
// Correct nodes should never send such checkpoints, but faulty ones could.
if _, ok := chkp.Memberships()[0][iss.ownID]; !ok {
iss.logger.Log(logging.LevelWarn, "Ignoring checkpoint. Not in membership.",
"sender", sender, "memberships", chkp.Memberships())
return nil
}

chkpMembership := chkp.PreviousMembership() // TODO this is wrong and it is a vulnerability, come back to fix after discussion (issue #384)
if chkpMembershipOffset > iss.Params.ConfigOffset {
// cannot verify checkpoint signatures, too far ahead
Expand Down

0 comments on commit c716023

Please sign in to comment.