Skip to content

Commit d6c4d36

Browse files
committed
icinga2: restart catch-up-phase on error
The catch-up-phase logic was extended to also propagate back an error if the state was left unsuccessfully. In this case - and not if another catch-up-phase was requested and the old phase worker got canceled - another attempt will be made.
1 parent f66f726 commit d6c4d36

File tree

2 files changed

+75
-26
lines changed

2 files changed

+75
-26
lines changed

internal/icinga2/client.go

+72-23
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package icinga2
22

33
import (
44
"context"
5+
"errors"
56
"github.com/icinga/icinga-notifications/internal/event"
67
"go.uber.org/zap"
78
"golang.org/x/sync/errgroup"
@@ -18,6 +19,15 @@ type eventMsg struct {
1819
apiTime time.Time
1920
}
2021

22+
// catchupEventMsg propagates either an eventMsg or an error back from the catch-up worker.
23+
//
24+
// The type must be used as a sum-type like data structure holding either an eventMsg pointer or an error. The error
25+
// should have a higher precedence than the eventMsg.
26+
type catchupEventMsg struct {
27+
*eventMsg
28+
error
29+
}
30+
2131
// Client for the Icinga 2 Event Stream API with support for other Icinga 2 APIs to gather additional information and
2232
// perform a catch-up of unknown events either when starting up to or in case of a connection loss.
2333
//
@@ -181,11 +191,14 @@ func (client *Client) buildAcknowledgementEvent(ctx context.Context, host, servi
181191
// startCatchupWorkers launches goroutines for catching up the Icinga 2 API state.
182192
//
183193
// Each event will be sent to the returned channel. When all launched workers have finished - either because all are
184-
// done or one has failed and the others were interrupted -, the channel will be closed. Those workers honor a context
185-
// derived from the Client.Ctx and would either stop when this context is done or when the context.CancelFunc is called.
186-
func (client *Client) startCatchupWorkers() (chan *eventMsg, context.CancelFunc) {
194+
// done or one has failed and the others were interrupted -, the channel will be closed. In case of a failure, _one_
195+
// final error will be sent back.
196+
//
197+
// Those workers honor a context derived from the Client.Ctx and would either stop when this context is done or when the
198+
// context.CancelFunc is called.
199+
func (client *Client) startCatchupWorkers() (chan *catchupEventMsg, context.CancelFunc) {
187200
startTime := time.Now()
188-
eventMsgCh := make(chan *eventMsg)
201+
catchupEventCh := make(chan *catchupEventMsg)
189202

190203
// Unfortunately, the errgroup context is hidden, that's why another context is necessary.
191204
ctx, cancel := context.WithCancel(client.Ctx)
@@ -195,7 +208,7 @@ func (client *Client) startCatchupWorkers() (chan *eventMsg, context.CancelFunc)
195208
for _, objType := range objTypes {
196209
objType := objType // https://go.dev/doc/faq#closures_and_goroutines
197210
group.Go(func() error {
198-
err := client.checkMissedChanges(groupCtx, objType, eventMsgCh)
211+
err := client.checkMissedChanges(groupCtx, objType, catchupEventCh)
199212
if err != nil {
200213
client.Logger.Errorw("Catch-up-phase event worker failed", zap.String("object type", objType), zap.Error(err))
201214
}
@@ -205,17 +218,27 @@ func (client *Client) startCatchupWorkers() (chan *eventMsg, context.CancelFunc)
205218

206219
go func() {
207220
err := group.Wait()
208-
if err != nil {
209-
client.Logger.Errorw("Catching up the API failed", zap.Error(err), zap.Duration("duration", time.Since(startTime)))
210-
} else {
221+
if err == nil {
211222
client.Logger.Infow("Catching up the API has finished", zap.Duration("duration", time.Since(startTime)))
223+
} else if errors.Is(err, context.Canceled) {
224+
// The context is either canceled when the Client got canceled or, more likely, when another catchup-worker
225+
// was requested. In the first case, the already sent messages will be discarded as the worker's main loop
226+
// was left. In the other case, the message buffers will be reset to an empty state.
227+
client.Logger.Warnw("Catching up the API was interrupted", zap.Duration("duration", time.Since(startTime)))
228+
} else {
229+
client.Logger.Errorw("Catching up the API failed", zap.Error(err), zap.Duration("duration", time.Since(startTime)))
230+
231+
select {
232+
case <-ctx.Done():
233+
case catchupEventCh <- &catchupEventMsg{error: err}:
234+
}
212235
}
213236

214237
cancel()
215-
close(eventMsgCh)
238+
close(catchupEventCh)
216239
}()
217240

218-
return eventMsgCh, cancel
241+
return catchupEventCh, cancel
219242
}
220243

221244
// worker is the Client's main background worker, taking care of event.Event dispatching and mode switching.
@@ -226,19 +249,30 @@ func (client *Client) startCatchupWorkers() (chan *eventMsg, context.CancelFunc)
226249
// Communication takes place over the eventDispatcherEventStream and catchupPhaseRequest channels.
227250
func (client *Client) worker() {
228251
var (
229-
// catchupEventCh emits events generated during the catch-up-phase from catch-up-workers. It will be closed when
230-
// catching up is done, which indicates the select below to switch phases. When this variable is nil, this
231-
// Client is in the normal operating phase.
232-
catchupEventCh chan *eventMsg
252+
// catchupEventCh either emits events generated during the catch-up-phase from catch-up-workers or one final
253+
// error if something went wrong. It will be closed when catching up is done, which indicates the select below
254+
// to switch phases. When this variable is nil, this Client is in the normal operating phase.
255+
catchupEventCh chan *catchupEventMsg
233256
// catchupCancel cancels, if not nil, all running catch-up-workers, e.g., when restarting catching-up.
234257
catchupCancel context.CancelFunc
235258

236259
// catchupBuffer holds Event Stream events to be replayed after the catch-up-phase has finished.
237260
catchupBuffer = make([]*event.Event, 0)
238261
// catchupCache maps event.Events.Name to API time to skip replaying outdated events.
239262
catchupCache = make(map[string]time.Time)
263+
264+
// catchupErr might hold an error received from catchupEventCh, indicating another catch-up-phase run.
265+
catchupErr error
240266
)
241267

268+
// catchupReset resets all catchup variables to their initial empty state.
269+
catchupReset := func() {
270+
catchupEventCh, catchupCancel = nil, nil
271+
catchupBuffer = make([]*event.Event, 0)
272+
catchupCache = make(map[string]time.Time)
273+
catchupErr = nil
274+
}
275+
242276
// catchupCacheUpdate updates the catchupCache if this eventMsg seems to be the latest of its kind.
243277
catchupCacheUpdate := func(ev *eventMsg) {
244278
ts, ok := catchupCache[ev.event.Name]
@@ -258,21 +292,28 @@ func (client *Client) worker() {
258292
client.Logger.Warn("Switching to catch-up-phase was requested while already catching up, restarting phase")
259293

260294
// Drain the old catch-up-phase producer channel until it is closed as its context will be canceled.
261-
go func(catchupEventCh chan *eventMsg) {
295+
go func(catchupEventCh chan *catchupEventMsg) {
262296
for _, ok := <-catchupEventCh; ok; {
263297
}
264298
}(catchupEventCh)
265299
catchupCancel()
266300
}
267301

268302
client.Logger.Info("Worker enters catch-up-phase, start caching up on Event Stream events")
303+
catchupReset()
269304
catchupEventCh, catchupCancel = client.startCatchupWorkers()
270305

271-
case ev, ok := <-catchupEventCh:
306+
case catchupMsg, ok := <-catchupEventCh:
272307
// Process an incoming event
273-
if ok {
274-
client.CallbackFn(ev.event)
275-
catchupCacheUpdate(ev)
308+
if ok && catchupMsg.error == nil {
309+
client.CallbackFn(catchupMsg.eventMsg.event)
310+
catchupCacheUpdate(catchupMsg.eventMsg)
311+
break
312+
}
313+
314+
// Store an incoming error as the catchupErr to be processed below
315+
if ok && catchupMsg.error != nil {
316+
catchupErr = catchupMsg.error
276317
break
277318
}
278319

@@ -295,11 +336,19 @@ func (client *Client) worker() {
295336
break
296337
}
297338

298-
client.Logger.Info("Worker leaves catch-up-phase, returning to normal operation")
339+
if catchupErr != nil {
340+
client.Logger.Warnw("Worker leaves catch-up-phase with an error, another attempt will be made", zap.Error(catchupErr))
341+
go func() {
342+
select {
343+
case <-client.Ctx.Done():
344+
case client.catchupPhaseRequest <- struct{}{}:
345+
}
346+
}()
347+
} else {
348+
client.Logger.Info("Worker leaves catch-up-phase, returning to normal operation")
349+
}
299350

300-
catchupEventCh, catchupCancel = nil, nil
301-
catchupBuffer = make([]*event.Event, 0)
302-
catchupCache = make(map[string]time.Time)
351+
catchupReset()
303352

304353
case ev := <-client.eventDispatcherEventStream:
305354
// During catch-up-phase, buffer Event Stream events

internal/icinga2/client_api.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ func (client *Client) fetchAcknowledgementComment(ctx context.Context, host, ser
187187
//
188188
// If the object's acknowledgement field is non-zero, an Acknowledgement Event will be constructed following the Host or
189189
// Service object. Each event will be delivered to the channel.
190-
func (client *Client) checkMissedChanges(ctx context.Context, objType string, eventCh chan *eventMsg) error {
190+
func (client *Client) checkMissedChanges(ctx context.Context, objType string, catchupEventCh chan *catchupEventMsg) error {
191191
jsonRaw, err := client.queryObjectsApiDirect(ctx, objType, "")
192192
if err != nil {
193193
return err
@@ -236,7 +236,7 @@ func (client *Client) checkMissedChanges(ctx context.Context, objType string, ev
236236
select {
237237
case <-ctx.Done():
238238
return ctx.Err()
239-
case eventCh <- &eventMsg{ev, objQueriesResult.Attrs.LastStateChange.Time()}:
239+
case catchupEventCh <- &catchupEventMsg{eventMsg: &eventMsg{ev, objQueriesResult.Attrs.LastStateChange.Time()}}:
240240
stateChangeEvents++
241241
}
242242

@@ -263,7 +263,7 @@ func (client *Client) checkMissedChanges(ctx context.Context, objType string, ev
263263
select {
264264
case <-ctx.Done():
265265
return ctx.Err()
266-
case eventCh <- &eventMsg{ev, objQueriesResult.Attrs.LastStateChange.Time()}:
266+
case catchupEventCh <- &catchupEventMsg{eventMsg: &eventMsg{ev, objQueriesResult.Attrs.LastStateChange.Time()}}:
267267
acknowledgementEvents++
268268
}
269269
}

0 commit comments

Comments
 (0)