Skip to content

Commit c567066

Browse files
committed
icinga2: Client fixes
- Rate limit catch-up-phase worker start. In case of a network disruption during the catch-up-phase, this will result in an error and infinite retries. Those, however, might result in lots of useless logging, which can be rate limited. - Remove the both useless and broken catchupEventCh drainage logic. All sends are being protected by context checks. - Abort early on errors received from the catchupEventCh and don't store them for later.
1 parent 64c91b8 commit c567066

File tree

1 file changed

+30
-27
lines changed

1 file changed

+30
-27
lines changed

internal/icinga2/client.go

+30-27
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"github.com/icinga/icinga-notifications/internal/event"
77
"go.uber.org/zap"
88
"golang.org/x/sync/errgroup"
9+
"math"
910
"net/http"
1011
"net/url"
1112
"time"
@@ -21,8 +22,8 @@ type eventMsg struct {
2122

2223
// catchupEventMsg propagates either an eventMsg or an error back from the catch-up worker.
2324
//
24-
// The type must be used as a sum-type like data structure holding either an eventMsg pointer or an error. The error
25-
// should have a higher precedence than the eventMsg.
25+
// The type must be used as a sum-type like data structure holding either an error or an eventMsg pointer. The error has
26+
// a higher precedence than the eventMsg.
2627
type catchupEventMsg struct {
2728
*eventMsg
2829
error
@@ -196,7 +197,10 @@ func (client *Client) buildAcknowledgementEvent(ctx context.Context, host, servi
196197
//
197198
// Those workers honor a context derived from the Client.Ctx and would either stop when this context is done or when the
198199
// context.CancelFunc is called.
199-
func (client *Client) startCatchupWorkers() (chan *catchupEventMsg, context.CancelFunc) {
200+
//
201+
// The startup time might be delayed through the parameter. This lets the goroutines sleep to rate-limit reconnection
202+
// attempts during network hiccups.
203+
func (client *Client) startCatchupWorkers(delay time.Duration) (chan *catchupEventMsg, context.CancelFunc) {
200204
startTime := time.Now()
201205
catchupEventCh := make(chan *catchupEventMsg)
202206

@@ -208,6 +212,12 @@ func (client *Client) startCatchupWorkers() (chan *catchupEventMsg, context.Canc
208212
for _, objType := range objTypes {
209213
objType := objType // https://go.dev/doc/faq#closures_and_goroutines
210214
group.Go(func() error {
215+
select {
216+
case <-ctx.Done():
217+
return ctx.Err()
218+
case <-time.After(delay):
219+
}
220+
211221
err := client.checkMissedChanges(groupCtx, objType, catchupEventCh)
212222
if err != nil {
213223
client.Logger.Errorw("Catch-up-phase event worker failed", zap.String("object type", objType), zap.Error(err))
@@ -261,16 +271,16 @@ func (client *Client) worker() {
261271
// catchupCache maps event.Events.Name to API time to skip replaying outdated events.
262272
catchupCache = make(map[string]time.Time)
263273

264-
// catchupErr might hold an error received from catchupEventCh, indicating another catch-up-phase run.
265-
catchupErr error
274+
// catchupFailCounter indicates how many prior catch-up-phase attempts have failed. It will be used to
275+
// rate limit catch-up-phase restarts.
276+
catchupFailCounter int
266277
)
267278

268279
// catchupReset resets all catchup variables to their initial empty state.
269280
catchupReset := func() {
270281
catchupEventCh, catchupCancel = nil, nil
271282
catchupBuffer = make([]*event.Event, 0)
272283
catchupCache = make(map[string]time.Time)
273-
catchupErr = nil
274284
}
275285

276286
// catchupCacheUpdate updates the catchupCache if this eventMsg seems to be the latest of its kind.
@@ -290,18 +300,13 @@ func (client *Client) worker() {
290300
case <-client.catchupPhaseRequest:
291301
if catchupEventCh != nil {
292302
client.Logger.Warn("Switching to catch-up-phase was requested while already catching up, restarting phase")
293-
294-
// Drain the old catch-up-phase producer channel until it is closed as its context will be canceled.
295-
go func(catchupEventCh chan *catchupEventMsg) {
296-
for _, ok := <-catchupEventCh; ok; {
297-
}
298-
}(catchupEventCh)
299303
catchupCancel()
300304
}
301305

302306
client.Logger.Info("Worker enters catch-up-phase, start caching up on Event Stream events")
303307
catchupReset()
304-
catchupEventCh, catchupCancel = client.startCatchupWorkers()
308+
catchupEventCh, catchupCancel = client.startCatchupWorkers(
309+
min(3*time.Minute, time.Duration(math.Exp2(float64(catchupFailCounter))-1)*time.Second))
305310

306311
case catchupMsg, ok := <-catchupEventCh:
307312
// Process an incoming event
@@ -311,9 +316,17 @@ func (client *Client) worker() {
311316
break
312317
}
313318

314-
// Store an incoming error as the catchupErr to be processed below
319+
// Abort and restart the catch-up-phase when receiving an error.
315320
if ok && catchupMsg.error != nil {
316-
catchupErr = catchupMsg.error
321+
client.Logger.Warnw("Worker leaves catch-up-phase with an error, another attempt will be made", zap.Error(catchupMsg.error))
322+
go func() {
323+
select {
324+
case <-client.Ctx.Done():
325+
case client.catchupPhaseRequest <- struct{}{}:
326+
}
327+
}()
328+
catchupReset()
329+
catchupFailCounter++
317330
break
318331
}
319332

@@ -336,19 +349,9 @@ func (client *Client) worker() {
336349
break
337350
}
338351

339-
if catchupErr != nil {
340-
client.Logger.Warnw("Worker leaves catch-up-phase with an error, another attempt will be made", zap.Error(catchupErr))
341-
go func() {
342-
select {
343-
case <-client.Ctx.Done():
344-
case client.catchupPhaseRequest <- struct{}{}:
345-
}
346-
}()
347-
} else {
348-
client.Logger.Info("Worker leaves catch-up-phase, returning to normal operation")
349-
}
350-
352+
client.Logger.Info("Worker leaves catch-up-phase, returning to normal operation")
351353
catchupReset()
354+
catchupFailCounter = 0
352355

353356
case ev := <-client.eventDispatcherEventStream:
354357
// During catch-up-phase, buffer Event Stream events

0 commit comments

Comments
 (0)