diff --git a/cmd/sentinel/cmd/sentinel.go b/cmd/sentinel/cmd/sentinel.go index 100e381bb..541f6f09f 100644 --- a/cmd/sentinel/cmd/sentinel.go +++ b/cmd/sentinel/cmd/sentinel.go @@ -98,6 +98,12 @@ func (s *Sentinel) electionLoop(ctx context.Context) { case err := <-errCh: if err != nil { log.Errorw("election loop error", zap.Error(err)) + + // It's important to Stop() any on-going elections, as most stores will block + // until all previous elections have completed. If we continue without stopping, + // we run the risk of preventing any subsequent elections from successfully + // electing a leader. + s.election.Stop() } goto end case <-ctx.Done(): diff --git a/internal/store/etcdv3.go b/internal/store/etcdv3.go index 135782f2c..895e722e0 100644 --- a/internal/store/etcdv3.go +++ b/internal/store/etcdv3.go @@ -199,14 +199,12 @@ func (e *etcdv3Election) campaign() { e.electedCh <- false s, err := concurrency.NewSession(e.c, concurrency.WithTTL(int(e.ttl.Seconds())), concurrency.WithContext(e.ctx)) if err != nil { - e.running = false e.errCh <- err return } etcdElection := concurrency.NewElection(s, e.path) if err = etcdElection.Campaign(e.ctx, e.candidateUID); err != nil { - e.running = false e.errCh <- err return } @@ -215,11 +213,8 @@ func (e *etcdv3Election) campaign() { select { case <-e.ctx.Done(): - e.running = false - etcdElection.Resign(context.TODO()) return case <-s.Done(): - etcdElection.Resign(context.TODO()) e.electedCh <- false } } diff --git a/internal/store/store.go b/internal/store/store.go index 6becbb4e7..5776fa56e 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -47,6 +47,11 @@ type Election interface { // TODO(sgotti) this mimics the current docker/leadership API and the etcdv3 // implementations adapt to it. In future it could be replaced with a better // api like the current one implemented by etcdclientv3/concurrency. + // + // WARNING: If the election error channel receives any error, it is vital that + // the consuming code calls election.Stop(). Failure to do so can cause + // subsequent elections to hang indefinitely across all participants of an + // election. RunForElection() (<-chan bool, <-chan error) Leader() (string, error) Stop()