Skip to content

Commit 5328718

Browse files
Fix leader election edge-cases (#373)
1 parent e9db7b6 commit 5328718

File tree

4 files changed

+33
-27
lines changed

4 files changed

+33
-27
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ go 1.20
44

55
require (
66
github.com/arangodb-helper/go-certificates v0.0.0-20180821055445-9fca24fc2680
7-
github.com/arangodb-helper/go-helper v0.2.1
7+
github.com/arangodb-helper/go-helper v0.4.1
88
github.com/arangodb/go-driver v1.6.0
99
github.com/arangodb/go-upgrade-rules v0.0.0-20200605091205-439fb1ee86e7
1010
github.com/cenkalti/backoff v2.2.1+incompatible

go.sum

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,12 @@ github.com/arangodb-helper/go-helper v0.2.0 h1:rpkX0msiMVorxPJygR2R5s67RjQdFEIGm
1515
github.com/arangodb-helper/go-helper v0.2.0/go.mod h1:RHgEwQTFWdJ9wFDGUCgUZzaz9NLaFUskSsHgOPM5XR4=
1616
github.com/arangodb-helper/go-helper v0.2.1 h1:hwh/Nbkce3FHtegKYKzfGIhlGI5bRaDMpjQCRKMG55A=
1717
github.com/arangodb-helper/go-helper v0.2.1/go.mod h1:RHgEwQTFWdJ9wFDGUCgUZzaz9NLaFUskSsHgOPM5XR4=
18+
github.com/arangodb-helper/go-helper v0.4.1-0.20230713102015-83a33422c87a h1:TsR6WxBzhwTJyYXyWedrzUdmLL/MnIw18wJVPT8V0HI=
19+
github.com/arangodb-helper/go-helper v0.4.1-0.20230713102015-83a33422c87a/go.mod h1:RHgEwQTFWdJ9wFDGUCgUZzaz9NLaFUskSsHgOPM5XR4=
20+
github.com/arangodb-helper/go-helper v0.4.1-0.20230713105633-bc4e0cf3a627 h1:gmZ3WBqiQGPvRtKBUoWvFSwb2eXIg+X1MaNMoSG+CwY=
21+
github.com/arangodb-helper/go-helper v0.4.1-0.20230713105633-bc4e0cf3a627/go.mod h1:RHgEwQTFWdJ9wFDGUCgUZzaz9NLaFUskSsHgOPM5XR4=
22+
github.com/arangodb-helper/go-helper v0.4.1 h1:yO4Bu5AhuvenDe5AmSWRo/ya/GI1lmVA+BYrRT9umsI=
23+
github.com/arangodb-helper/go-helper v0.4.1/go.mod h1:RHgEwQTFWdJ9wFDGUCgUZzaz9NLaFUskSsHgOPM5XR4=
1824
github.com/arangodb/go-driver v1.6.0 h1:NFWj/idqXZxhFVueihMSI2R9NotNIsgvNfM/xmpekb4=
1925
github.com/arangodb/go-driver v1.6.0/go.mod h1:HQmdGkvNMVBTE3SIPSQ8T/ZddC6iwNsfMR+dDJQxIsI=
2026
github.com/arangodb/go-upgrade-rules v0.0.0-20200605091205-439fb1ee86e7 h1:zCY5fsv5apos+oAdd1bLr1UEFOHeIUDZCItbwU/u6XE=

service/runtime_cluster_manager.go

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -106,18 +106,17 @@ func (s *runtimeClusterManager) updateClusterConfiguration(ctx context.Context,
106106
return nil
107107
}
108108

109-
func (s *runtimeClusterManager) runLeaderElection(ctx context.Context, agencyClient agency.Agency, myURL string) {
110-
le := election.NewLeaderElectionCell[string](agencyClient, masterURLKey, masterURLTTL)
109+
func (s *runtimeClusterManager) runLeaderElection(ctx context.Context, myURL string) {
110+
le := election.NewLeaderElectionCell[string](masterURLKey, masterURLTTL)
111111

112-
var err error
113-
var delay time.Duration
112+
delay := time.Microsecond
114113
resignErrBackoff := backoff.NewExponentialBackOff()
115114
for {
116115
timer := time.NewTimer(delay)
117116
// Wait a bit
118117
select {
119118
case <-timer.C:
120-
// Delay over, just continue
119+
// Delay over, just continue
121120
case <-ctx.Done():
122121
// We're asked to stop
123122
if !timer.Stop() {
@@ -126,11 +125,18 @@ func (s *runtimeClusterManager) runLeaderElection(ctx context.Context, agencyCli
126125
return
127126
}
128127

128+
agencyClient, err := s.createAgencyAPI()
129+
if err != nil {
130+
delay = time.Second
131+
s.log.Debug().Err(err).Msgf("could not create agency client. Retrying in %s", delay)
132+
continue
133+
}
134+
129135
oldMasterURL := s.GetMasterURL()
130136
if s.avoidBeingMaster {
131137
if oldMasterURL == "" {
132138
s.log.Debug().Msg("Initializing master URL before resigning")
133-
currMasterURL, err := le.Read(ctx)
139+
currMasterURL, err := le.Read(ctx, agencyClient)
134140
if err != nil {
135141
delay = 5 * time.Second
136142
s.log.Err(err).Msgf("Failed to read current value before resigning. Retrying in %s", delay)
@@ -140,7 +146,7 @@ func (s *runtimeClusterManager) runLeaderElection(ctx context.Context, agencyCli
140146
}
141147

142148
s.log.Debug().Str("master_url", myURL).Msgf("Resigning leadership")
143-
err = le.Resign(ctx)
149+
err = le.Resign(ctx, agencyClient)
144150
if err != nil {
145151
delay = resignErrBackoff.NextBackOff()
146152
s.log.Err(err).Msgf("Resigning leadership failed. Retrying in %s", delay)
@@ -157,7 +163,7 @@ func (s *runtimeClusterManager) runLeaderElection(ctx context.Context, agencyCli
157163
s.log.Debug().
158164
Str("master_url", myURL).
159165
Msg("Updating leadership")
160-
masterURL, isMaster, delay, err = le.Update(ctx, myURL)
166+
masterURL, isMaster, delay, err = le.Update(ctx, agencyClient, myURL)
161167
if err != nil {
162168
delay = 5 * time.Second
163169
s.log.Error().Err(err).Msgf("Update leader election failed. Retrying in %s", delay)
@@ -166,16 +172,15 @@ func (s *runtimeClusterManager) runLeaderElection(ctx context.Context, agencyCli
166172
if isMaster && masterURL != myURL {
167173
s.log.Error().Msgf("Unexpected error: this peer is a master but URL differs. Should be %s got %s", myURL, masterURL)
168174
}
175+
if !isMaster && masterURL == myURL {
176+
s.log.Error().Msgf("Unexpected error: this peer is not a master but URL in agency is mine")
177+
}
169178

170179
s.updateMasterURL(masterURL, isMaster)
171180
}
172181
}
173182

174183
func (s *runtimeClusterManager) updateMasterURL(masterURL string, isMaster bool) {
175-
s.log.Debug().
176-
Str("new_master_url", masterURL).
177-
Bool("is_master", isMaster).
178-
Msg("Leadership updated")
179184
newState := stateRunningSlave
180185
if isMaster {
181186
newState = stateRunningMaster
@@ -215,16 +220,11 @@ func (s *runtimeClusterManager) Run(ctx context.Context, log zerolog.Logger, run
215220
return
216221
}
217222

218-
agencyClient, err := s.createAgencyAPI()
219-
if err != nil {
220-
log.Error().Msg("Could not create agency API client")
221-
return
222-
}
223223
ownURL := myPeer.CreateStarterURL("/")
224-
go s.runLeaderElection(ctx, agencyClient, ownURL)
224+
go s.runLeaderElection(ctx, ownURL)
225225

226226
for {
227-
var delay time.Duration
227+
delay := time.Microsecond
228228
// Loop until stopping
229229
if ctx.Err() != nil {
230230
// Stop requested
@@ -243,7 +243,7 @@ func (s *runtimeClusterManager) Run(ctx context.Context, log zerolog.Logger, run
243243
delay = time.Second * 15
244244
}
245245
} else {
246-
// we are still leading, check again later
246+
// we are still leading or not initialized, check again later
247247
delay = time.Second * 5
248248
}
249249

service/upgrade_manager.go

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -304,22 +304,22 @@ func (m *upgradeManager) StartDatabaseUpgrade(ctx context.Context, forceMinorUpg
304304
return maskAny(err)
305305
}
306306
m.log.Debug().Msg("Creating lock")
307-
lock, err := election.NewLock(m, api, upgradeManagerLockKey, "", upgradeManagerLockTTL)
307+
lock, err := election.NewLock(m, upgradeManagerLockKey, "", upgradeManagerLockTTL)
308308
if err != nil {
309309
return maskAny(err)
310310
}
311311

312312
// Claim the upgrade lock
313313
m.log.Debug().Msg("Locking lock")
314-
if err := lock.Lock(ctx); err != nil {
314+
if err := lock.Lock(ctx, api); err != nil {
315315
m.log.Debug().Err(err).Msg("Lock failed")
316316
return maskAny(err)
317317
}
318318

319319
// Close agency lock when we're done
320320
defer func() {
321321
m.log.Debug().Msg("Unlocking lock")
322-
lock.Unlock(context.Background())
322+
lock.Unlock(context.Background(), api)
323323
}()
324324

325325
m.log.Debug().Msg("Reading upgrade plan...")
@@ -547,22 +547,22 @@ func (m *upgradeManager) AbortDatabaseUpgrade(ctx context.Context) error {
547547
return maskAny(err)
548548
}
549549
m.log.Debug().Msg("Creating lock")
550-
lock, err := election.NewLock(m, api, upgradeManagerLockKey, "", upgradeManagerLockTTL)
550+
lock, err := election.NewLock(m, upgradeManagerLockKey, "", upgradeManagerLockTTL)
551551
if err != nil {
552552
return maskAny(err)
553553
}
554554

555555
// Claim the upgrade lock
556556
m.log.Debug().Msg("Locking lock")
557-
if err := lock.Lock(ctx); err != nil {
557+
if err := lock.Lock(ctx, api); err != nil {
558558
m.log.Debug().Err(err).Msg("Lock failed")
559559
return maskAny(err)
560560
}
561561

562562
// Close agency lock when we're done
563563
defer func() {
564564
m.log.Debug().Msg("Unlocking lock")
565-
lock.Unlock(context.Background())
565+
lock.Unlock(context.Background(), api)
566566
}()
567567

568568
// Check plan

0 commit comments

Comments
 (0)