handle unexpected successor exit

This commit is contained in:
Branden J Brown 2025-03-14 09:39:57 -04:00
parent 56c1ad3737
commit 4c79714fcd
2 changed files with 41 additions and 0 deletions

View File

@ -113,6 +113,46 @@ func (n *Node) SetLocal(k ID, v string) bool {
return false
}
// SuccessorFailed marks the node's current successor as having failed, e.g.
// during stabilization, and selects a replacement.
//
// Replacement candidates are tried in this order:
//
//  1. the next entry in the successor list, if the list has more than one;
//  2. otherwise, the first finger that is valid, is not this node, and is not
//     the failed successor;
//  3. the predecessor, if it is still valid;
//  4. the node itself.
//
// If the failed successor is also the predecessor, the predecessor is cleared
// before any candidate is chosen. The method holds n.mu for its duration.
func (n *Node) SuccessorFailed() {
	// TODO(branden): most of this will probably need to be rewritten once
	// replication and fingers are implemented
	n.mu.Lock()
	defer n.mu.Unlock()
	// NOTE(review): this indexes n.succ[0] unconditionally, so it assumes the
	// successor list is never empty — confirm against Node's constructor.
	old := n.succ[0]
	// If there are only two nodes in the network, the predecessor is also the
	// successor, which means it too has failed. Clear it in that case.
	if old.addr == n.pred.addr {
		n.pred = Peer{}
	}
	if len(n.succ) > 1 {
		// The successor list has replication. Just shift it down.
		copy(n.succ, n.succ[1:])
		n.succ = n.succ[:len(n.succ)-1]
	} else {
		// Check the finger table.
		for _, f := range n.fingers {
			// Skip entries that cannot serve as a successor: ones IsValid
			// rejects, ourselves, and the node that just failed. Without the
			// last check the failed successor would be picked again whenever
			// it is the first finger, and later fingers would never be tried.
			if !f.IsValid() || f.addr == n.self.addr || f.addr == old.addr {
				continue
			}
			// TODO(branden): ensure the list stays sorted
			n.succ[0] = f
			break
		}
	}
	if n.succ[0] != old {
		return
	}
	// We couldn't find a new successor in either the successor list or the
	// finger table. The only other candidate we have is our predecessor.
	// Stabilization will eventually work us out even if that's wrong.
	if n.pred.IsValid() {
		n.succ[0] = n.pred
		return
	}
	// No other peers are known; point the successor at ourselves.
	n.succ[0] = n.self
}
// Peer is the ID and address of a node.
type Peer struct {
id ID

View File

@ -181,6 +181,7 @@ func cliJoin(ctx context.Context, cmd *cli.Command) error {
for range t.C {
if err := chord.Stabilize(ctx, cl, node); err != nil {
slog.ErrorContext(ctx, "stabilize", slog.Any("err", err))
node.SuccessorFailed()
}
}
}()