@@ -22,57 +22,85 @@ package agency
22
22
23
23
import (
24
24
"context"
25
- "fmt"
26
25
"sync"
27
- "time"
28
26
29
27
"github.com/arangodb/go-driver/agency"
30
28
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
31
29
"github.com/arangodb/kube-arangodb/pkg/util/errors"
30
+ "github.com/arangodb/kube-arangodb/pkg/util/globals"
32
31
)
33
32
34
33
// health is a snapshot of the agency state assembled from the configs returned by the agents.
type health struct {
	// leaderID is the leader ID reported by the agents; empty when no agent reported a leader.
	leaderID string

	// agencySize is the agency size that quorum and full-health checks are evaluated against.
	agencySize int

	// names holds the agent ID of every client, stored at the same index as the client.
	// The entry is empty for a client whose config could not be fetched.
	names []string
	// commitIndexes maps an agent ID to the commit index reported by that agent.
	commitIndexes map[string]uint64
	// leaders maps an agent ID to the leader ID that agent reports (only agents which know a leader).
	leaders map[string]string
	// election maps a leader ID to the number of agents which report it as the leader.
	election map[string]int
}
39
43
40
44
// LeaderID returns the ID of the leader stored in this snapshot.
// The result is an empty string when no leader is known.
func (h health) LeaderID() string {
	id := h.leaderID

	return id
}
43
47
44
- // IsHealthy returns true if all agencies have the same commit index.
45
- // Returns false when:
46
- // - agencies' list is empty.
47
- // - agencies have different commit indices.
48
- // - agencies have commit indices == 0.
49
- func (h health ) IsHealthy () bool {
50
- var globalCommitIndex uint64
51
- first := true
52
-
53
- for _ , commitIndex := range h .commitIndexes {
54
- if first {
55
- globalCommitIndex = commitIndex
56
- first = false
57
- } else if commitIndex != globalCommitIndex {
58
- return false
48
+ // Healthy returns nil if all agencies have the same commit index.
49
+ func (h health ) Healthy () error {
50
+ if err := h .Serving (); err != nil {
51
+ return err
52
+ }
53
+
54
+ if h .election [h .leaderID ] != h .agencySize {
55
+ return errors .Newf ("Not all agents are in quorum" )
56
+ }
57
+
58
+ index := h .commitIndexes [h .leaderID ]
59
+ if index == 0 {
60
+ return errors .Newf ("Agency CommitIndex is zero" )
61
+ }
62
+
63
+ for k , v := range h .commitIndexes {
64
+ if v != index {
65
+ return errors .Newf ("Agent %s is behind in CommitIndex" , k )
59
66
}
60
67
}
61
68
62
- return globalCommitIndex != 0
69
+ return nil
70
+ }
71
+
72
+ func (h health ) Serving () error {
73
+ if h .agencySize == 0 {
74
+ return errors .Newf ("Empty agents list" )
75
+ }
76
+
77
+ if len (h .election ) == 0 {
78
+ return errors .Newf ("No Leader" )
79
+ } else if len (h .election ) > 1 {
80
+ return errors .Newf ("Multiple leaders" )
81
+ }
82
+
83
+ if len (h .leaders ) <= h .agencySize / 2 {
84
+ return errors .Newf ("Quorum is not present" )
85
+ }
86
+
87
+ return nil
63
88
}
64
89
65
90
// Health describes an interface to check the health of the environment.
type Health interface {
	// Healthy returns nil when the environment is considered healthy.
	Healthy() error

	// Serving returns nil when the environment is considered responsive,
	// even if it is not fully healthy.
	Serving() error

	// LeaderID returns the leader ID, or an empty string if the leader is not known.
	LeaderID() string
}
73
101
74
102
type Cache interface {
75
- Reload (ctx context.Context , clients []agency.Agency ) (uint64 , error )
103
+ Reload (ctx context.Context , size int , clients []agency.Agency ) (uint64 , error )
76
104
Data () (State , bool )
77
105
CommitIndex () uint64
78
106
// Health returns true when healthy object is available.
@@ -107,7 +135,7 @@ func (c cacheSingle) Health() (Health, bool) {
107
135
return nil , false
108
136
}
109
137
110
- func (c cacheSingle ) Reload (_ context.Context , _ []agency.Agency ) (uint64 , error ) {
138
+ func (c cacheSingle ) Reload (_ context.Context , _ int , _ []agency.Agency ) (uint64 , error ) {
111
139
return 0 , nil
112
140
}
113
141
@@ -153,15 +181,16 @@ func (c *cache) Health() (Health, bool) {
153
181
return nil , false
154
182
}
155
183
156
- func (c * cache ) Reload (ctx context.Context , clients []agency.Agency ) (uint64 , error ) {
184
+ func (c * cache ) Reload (ctx context.Context , size int , clients []agency.Agency ) (uint64 , error ) {
157
185
c .lock .Lock ()
158
186
defer c .lock .Unlock ()
159
187
160
- leaderCli , leaderConfig , health , err := getLeader (ctx , clients )
188
+ leaderCli , leaderConfig , health , err := getLeader (ctx , size , clients )
161
189
if err != nil {
162
190
// Invalidate a leader ID and agency state.
163
191
// In the next iteration leaderID will be sat because `valid` will be false.
164
192
c .valid = false
193
+ c .health = nil
165
194
166
195
return 0 , err
167
196
}
@@ -186,91 +215,62 @@ func (c *cache) Reload(ctx context.Context, clients []agency.Agency) (uint64, er
186
215
187
216
// getLeader returns config and client to a leader agency, and health to check if agencies are on the same page.
188
217
// If there is no quorum for the leader then error is returned.
189
- func getLeader (ctx context.Context , clients []agency.Agency ) (agency.Agency , * Config , Health , error ) {
190
- var mutex sync.Mutex
191
- var anyError error
192
- var wg sync.WaitGroup
218
+ func getLeader (ctx context.Context , size int , clients []agency.Agency ) (agency.Agency , * Config , Health , error ) {
219
+ configs := make ([]* Config , len (clients ))
220
+ errs := make ([]error , len (clients ))
193
221
194
- cliLen := len (clients )
195
- if cliLen == 0 {
196
- return nil , nil , nil , errors .New ("empty list of agencies' clients" )
197
- }
198
- configs := make ([]* Config , cliLen )
199
- leaders := make (map [string ]int , cliLen )
200
-
201
- var h health
222
+ var wg sync.WaitGroup
202
223
203
- h .commitIndexes = make (map [string ]uint64 , cliLen )
204
- // Fetch all configs from agencies.
205
- wg .Add (cliLen )
206
- for i , cli := range clients {
207
- go func (iLocal int , cliLocal agency.Agency ) {
224
+ // Fetch Agency config
225
+ for i := range clients {
226
+ wg .Add (1 )
227
+ go func (id int ) {
208
228
defer wg .Done ()
209
229
210
- ctxLocal , cancel := context . WithTimeout (ctx , time . Second )
230
+ ctxLocal , cancel := globals . GetGlobals (). Timeouts (). Agency (). WithTimeout (ctx )
211
231
defer cancel ()
212
- config , err := GetAgencyConfig (ctxLocal , cliLocal )
213
-
214
- mutex .Lock ()
215
- defer mutex .Unlock ()
232
+ config , err := GetAgencyConfig (ctxLocal , clients [id ])
216
233
217
234
if err != nil {
218
- anyError = err
219
- return
220
- } else if config == nil || config .LeaderId == "" {
221
- anyError = fmt .Errorf ("leader unknown for the agent %v" , cliLocal .Connection ().Endpoints ())
235
+ errs [id ] = err
222
236
return
223
237
}
224
238
225
239
// Write config on the same index where client is (It will be helpful later).
226
- configs [iLocal ] = config
227
- // Count leaders.
228
- leaders [config .LeaderId ]++
229
- h .commitIndexes [config .Configuration .ID ] = config .CommitIndex
230
- }(i , cli )
240
+ configs [id ] = config
241
+ }(i )
231
242
}
232
243
wg .Wait ()
233
244
234
- if anyError != nil {
235
- return nil , nil , nil , wrapError (anyError , "not all agencies are responsive" )
236
- }
237
-
238
- if len (leaders ) == 0 {
239
- return nil , nil , nil , wrapError (anyError , "failed to get config from agencies" )
240
- }
241
-
242
- // Find the leader ID which has the most votes from all agencies.
243
- maxVotes := 0
244
- var leaderID string
245
- for id , votes := range leaders {
246
- if votes > maxVotes {
247
- maxVotes = votes
248
- leaderID = id
245
+ var h health
246
+ h .agencySize = size
247
+ h .names = make ([]string , len (clients ))
248
+ h .commitIndexes = make (map [string ]uint64 , len (clients ))
249
+ h .leaders = make (map [string ]string , len (clients ))
250
+ h .election = make (map [string ]int , len (clients ))
251
+
252
+ for id := range configs {
253
+ if config := configs [id ]; config != nil {
254
+ name := config .Configuration .ID
255
+ h .names [id ] = name
256
+ h .commitIndexes [name ] = config .CommitIndex
257
+ if config .LeaderId != "" {
258
+ h .leaders [name ] = config .LeaderId
259
+ h .election [config .LeaderId ]++
260
+ h .leaderID = config .LeaderId
261
+ }
249
262
}
250
263
}
251
264
252
- h .leaderID = leaderID
253
-
254
- // Check if a leader has quorum from all possible agencies.
255
- if maxVotes <= cliLen / 2 {
256
- message := fmt .Sprintf ("no quorum for leader %s, votes %d of %d" , leaderID , maxVotes , cliLen )
257
- return nil , nil , nil , wrapError (anyError , message )
265
+ if err := h .Serving (); err != nil {
266
+ return nil , nil , nil , err
258
267
}
259
268
260
- // From here on, a leader with quorum is known.
261
- for i , config := range configs {
262
- if config != nil && config .Configuration .ID == leaderID {
263
- return clients [i ], config , h , nil
269
+ for id := range clients {
270
+ if h .leaderID == h .names [id ] {
271
+ return clients [id ], configs [id ], h , nil
264
272
}
265
273
}
266
274
267
- return nil , nil , nil , wrapError (anyError , "the leader is not responsive" )
268
- }
269
-
270
- func wrapError (err error , message string ) error {
271
- if err != nil {
272
- return errors .WithMessage (err , message )
273
- }
274
-
275
- return errors .New (message )
275
+ return nil , nil , nil , errors .Newf ("Unable to find agent" )
276
276
}
0 commit comments