@@ -2,6 +2,7 @@ package icinga2
2
2
3
3
import (
4
4
"context"
5
+ "errors"
5
6
"github.com/icinga/icinga-notifications/internal/event"
6
7
"go.uber.org/zap"
7
8
"golang.org/x/sync/errgroup"
@@ -18,6 +19,15 @@ type eventMsg struct {
18
19
apiTime time.Time
19
20
}
20
21
22
+ // catchupEventMsg propagates either an eventMsg or an error back from the catch-up worker.
23
+ //
24
+ // The type must be used as a sum-type like data structure holding either an eventMsg pointer or an error. The error
25
+ // should have a higher precedence than the eventMsg.
26
+ type catchupEventMsg struct {
27
+ * eventMsg
28
+ error
29
+ }
30
+
21
31
// Client for the Icinga 2 Event Stream API with support for other Icinga 2 APIs to gather additional information and
22
32
// perform a catch-up of unknown events either when starting up to or in case of a connection loss.
23
33
//
@@ -181,11 +191,14 @@ func (client *Client) buildAcknowledgementEvent(ctx context.Context, host, servi
181
191
// startCatchupWorkers launches goroutines for catching up the Icinga 2 API state.
182
192
//
183
193
// Each event will be sent to the returned channel. When all launched workers have finished - either because all are
184
- // done or one has failed and the others were interrupted -, the channel will be closed. Those workers honor a context
185
- // derived from the Client.Ctx and would either stop when this context is done or when the context.CancelFunc is called.
186
- func (client * Client ) startCatchupWorkers () (chan * eventMsg , context.CancelFunc ) {
194
+ // done or one has failed and the others were interrupted -, the channel will be closed. In case of a failure, _one_
195
+ // final error will be sent back.
196
+ //
197
+ // Those workers honor a context derived from the Client.Ctx and would either stop when this context is done or when the
198
+ // context.CancelFunc is called.
199
+ func (client * Client ) startCatchupWorkers () (chan * catchupEventMsg , context.CancelFunc ) {
187
200
startTime := time .Now ()
188
- eventMsgCh := make (chan * eventMsg )
201
+ catchupEventCh := make (chan * catchupEventMsg )
189
202
190
203
// Unfortunately, the errgroup context is hidden, that's why another context is necessary.
191
204
ctx , cancel := context .WithCancel (client .Ctx )
@@ -195,7 +208,7 @@ func (client *Client) startCatchupWorkers() (chan *eventMsg, context.CancelFunc)
195
208
for _ , objType := range objTypes {
196
209
objType := objType // https://go.dev/doc/faq#closures_and_goroutines
197
210
group .Go (func () error {
198
- err := client .checkMissedChanges (groupCtx , objType , eventMsgCh )
211
+ err := client .checkMissedChanges (groupCtx , objType , catchupEventCh )
199
212
if err != nil {
200
213
client .Logger .Errorw ("Catch-up-phase event worker failed" , zap .String ("object type" , objType ), zap .Error (err ))
201
214
}
@@ -205,17 +218,27 @@ func (client *Client) startCatchupWorkers() (chan *eventMsg, context.CancelFunc)
205
218
206
219
go func () {
207
220
err := group .Wait ()
208
- if err != nil {
209
- client .Logger .Errorw ("Catching up the API failed" , zap .Error (err ), zap .Duration ("duration" , time .Since (startTime )))
210
- } else {
221
+ if err == nil {
211
222
client .Logger .Infow ("Catching up the API has finished" , zap .Duration ("duration" , time .Since (startTime )))
223
+ } else if errors .Is (err , context .Canceled ) {
224
+ // The context is either canceled when the Client got canceled or, more likely, when another catchup-worker
225
+ // was requested. In the first case, the already sent messages will be discarded as the worker's main loop
226
+ // was left. In the other case, the message buffers will be reset to an empty state.
227
+ client .Logger .Warnw ("Catching up the API was interrupted" , zap .Duration ("duration" , time .Since (startTime )))
228
+ } else {
229
+ client .Logger .Errorw ("Catching up the API failed" , zap .Error (err ), zap .Duration ("duration" , time .Since (startTime )))
230
+
231
+ select {
232
+ case <- ctx .Done ():
233
+ case catchupEventCh <- & catchupEventMsg {error : err }:
234
+ }
212
235
}
213
236
214
237
cancel ()
215
- close (eventMsgCh )
238
+ close (catchupEventCh )
216
239
}()
217
240
218
- return eventMsgCh , cancel
241
+ return catchupEventCh , cancel
219
242
}
220
243
221
244
// worker is the Client's main background worker, taking care of event.Event dispatching and mode switching.
@@ -226,19 +249,30 @@ func (client *Client) startCatchupWorkers() (chan *eventMsg, context.CancelFunc)
226
249
// Communication takes place over the eventDispatcherEventStream and catchupPhaseRequest channels.
227
250
func (client * Client ) worker () {
228
251
var (
229
- // catchupEventCh emits events generated during the catch-up-phase from catch-up-workers. It will be closed when
230
- // catching up is done, which indicates the select below to switch phases. When this variable is nil, this
231
- // Client is in the normal operating phase.
232
- catchupEventCh chan * eventMsg
252
+ // catchupEventCh either emits events generated during the catch-up-phase from catch-up-workers or one final
253
+ // error if something went wrong. It will be closed when catching up is done, which indicates the select below
254
+ // to switch phases. When this variable is nil, this Client is in the normal operating phase.
255
+ catchupEventCh chan * catchupEventMsg
233
256
// catchupCancel cancels, if not nil, all running catch-up-workers, e.g., when restarting catching-up.
234
257
catchupCancel context.CancelFunc
235
258
236
259
// catchupBuffer holds Event Stream events to be replayed after the catch-up-phase has finished.
237
260
catchupBuffer = make ([]* event.Event , 0 )
238
261
// catchupCache maps event.Events.Name to API time to skip replaying outdated events.
239
262
catchupCache = make (map [string ]time.Time )
263
+
264
+ // catchupErr might hold an error received from catchupEventCh, indicating another catch-up-phase run.
265
+ catchupErr error
240
266
)
241
267
268
+ // catchupReset resets all catchup variables to their initial empty state.
269
+ catchupReset := func () {
270
+ catchupEventCh , catchupCancel = nil , nil
271
+ catchupBuffer = make ([]* event.Event , 0 )
272
+ catchupCache = make (map [string ]time.Time )
273
+ catchupErr = nil
274
+ }
275
+
242
276
// catchupCacheUpdate updates the catchupCache if this eventMsg seems to be the latest of its kind.
243
277
catchupCacheUpdate := func (ev * eventMsg ) {
244
278
ts , ok := catchupCache [ev .event .Name ]
@@ -258,21 +292,28 @@ func (client *Client) worker() {
258
292
client .Logger .Warn ("Switching to catch-up-phase was requested while already catching up, restarting phase" )
259
293
260
294
// Drain the old catch-up-phase producer channel until it is closed as its context will be canceled.
261
- go func (catchupEventCh chan * eventMsg ) {
295
+ go func (catchupEventCh chan * catchupEventMsg ) {
262
296
for _ , ok := <- catchupEventCh ; ok ; {
263
297
}
264
298
}(catchupEventCh )
265
299
catchupCancel ()
266
300
}
267
301
268
302
client .Logger .Info ("Worker enters catch-up-phase, start caching up on Event Stream events" )
303
+ catchupReset ()
269
304
catchupEventCh , catchupCancel = client .startCatchupWorkers ()
270
305
271
- case ev , ok := <- catchupEventCh :
306
+ case catchupMsg , ok := <- catchupEventCh :
272
307
// Process an incoming event
273
- if ok {
274
- client .CallbackFn (ev .event )
275
- catchupCacheUpdate (ev )
308
+ if ok && catchupMsg .error == nil {
309
+ client .CallbackFn (catchupMsg .eventMsg .event )
310
+ catchupCacheUpdate (catchupMsg .eventMsg )
311
+ break
312
+ }
313
+
314
+ // Store an incoming error as the catchupErr to be processed below
315
+ if ok && catchupMsg .error != nil {
316
+ catchupErr = catchupMsg .error
276
317
break
277
318
}
278
319
@@ -295,11 +336,19 @@ func (client *Client) worker() {
295
336
break
296
337
}
297
338
298
- client .Logger .Info ("Worker leaves catch-up-phase, returning to normal operation" )
339
+ if catchupErr != nil {
340
+ client .Logger .Warnw ("Worker leaves catch-up-phase with an error, another attempt will be made" , zap .Error (catchupErr ))
341
+ go func () {
342
+ select {
343
+ case <- client .Ctx .Done ():
344
+ case client .catchupPhaseRequest <- struct {}{}:
345
+ }
346
+ }()
347
+ } else {
348
+ client .Logger .Info ("Worker leaves catch-up-phase, returning to normal operation" )
349
+ }
299
350
300
- catchupEventCh , catchupCancel = nil , nil
301
- catchupBuffer = make ([]* event.Event , 0 )
302
- catchupCache = make (map [string ]time.Time )
351
+ catchupReset ()
303
352
304
353
case ev := <- client .eventDispatcherEventStream :
305
354
// During catch-up-phase, buffer Event Stream events
0 commit comments