eventstream: rework replay communication logic

oxzi · oxzi · commit 500b2a974d5c · 2023-11-07T11:31:57.000+01:00
Previously, two variables - a signaling channel and an atomic.Bool -
held and communicated whether the eventDispatcher was in its replay
phase. In particular, the atomic.Bool was accessed both by the producer
- enterReplayPhase - and by the consumer - eventDispatcher.

With this rework, the whole logic moved into the main worker, previously
named eventDispatcher. A single channel - replayPhaseRequest - is now
used to ask the worker to switch to the replay phase. This simplifies
and unifies the internal "API", as all communication with the worker
takes place over unidirectional channels.
diff --git a/internal/eventstream/client.go b/internal/eventstream/client.go
@@ -15,7 +15,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"sync/atomic"
 	"time"
 )
 
@@ -28,7 +27,15 @@ type eventMsg struct {
 }
 
 // Client for the Icinga 2 Event Stream API with extended support for other Icinga 2 APIs to gather additional
-// information and allow a replay in case of a connection loss.
+// information and to perform a replay, either on startup to catch up with Icinga's state or after a connection loss.
+//
+// Within the icinga-notifications scope, one or multiple Client instances can be generated from the configuration by
+// calling NewClientsFromConfig.
+//
+// A Client must be started by calling its Process method, which blocks until Ctx is marked as done. Reconnections and
+// the necessary state replaying from the Icinga 2 API will be taken care of. Internally, the Client executes a worker
+// within its own goroutine, which dispatches event.Event to the CallbackFn and enforces event.Event order during
+// replaying after (re-)connections.
 type Client struct {
 	// ApiHost et al. configure where and how the Icinga 2 API can be reached.
 	ApiHost          string
@@ -52,11 +59,8 @@ type Client struct {
 	eventDispatcherEventStream chan *eventMsg
 	// eventDispatcherReplay communicates Events to be processed from the Icinga 2 API replay during replay phase.
 	eventDispatcherReplay chan *eventMsg
-
-	// replayTrigger signals the eventDispatcher method that the replay phase is finished.
-	replayTrigger chan struct{}
-	// replayPhase indicates that Events will be cached as the Event Stream Client is in the replay phase.
-	replayPhase atomic.Bool
+	// replayPhaseRequest requests the main worker to switch to the replay phase and re-request the Icinga 2 API.
+	replayPhaseRequest chan struct{}
 }
 
 // NewClientsFromConfig returns all Clients defined in the conf.ConfigFile.
@@ -252,11 +256,50 @@ func (client *Client) buildAcknowledgementEvent(ctx context.Context, host, servi
 	return ev, nil
 }
 
-// eventDispatcher receives generated event.Events to be either buffered or directly delivered to the CallbackFn.
+// startReplayWorker launches goroutines for replaying the Icinga 2 API state.
 //
-// When the Client is in the replay phase, events from the Event Stream API will be cached until the replay phase has
-// finished, while replayed events will be delivered directly.
-func (client *Client) eventDispatcher() {
+// When all launched workers have finished - either because all are done or one has failed and the others were
+// interrupted -, the returned channel will be closed. Those workers honor the Ctx and would also fail when the main
+// context is done.
+func (client *Client) startReplayWorker() chan struct{} {
+	startTime := time.Now()
+	group, groupCtx := errgroup.WithContext(client.Ctx)
+
+	objTypes := []string{"host", "service"}
+	for _, objType := range objTypes {
+		objType := objType // https://go.dev/doc/faq#closures_and_goroutines
+		group.Go(func() error {
+			err := client.checkMissedChanges(groupCtx, objType)
+			if err != nil {
+				client.Logger.Errorw("Replaying API events resulted in errors",
+					zap.String("object type", objType), zap.Error(err))
+			}
+			return err
+		})
+	}
+
+	finCh := make(chan struct{})
+	go func() {
+		err := group.Wait()
+		if err != nil {
+			client.Logger.Errorw("Replaying the API resulted in errors", zap.Error(err), zap.Duration("duration", time.Since(startTime)))
+		} else {
+			client.Logger.Infow("Replaying the API has finished", zap.Duration("duration", time.Since(startTime)))
+		}
+
+		close(finCh)
+	}()
+
+	return finCh
+}
+
+// worker is the Client's main background worker, taking care of event.Event dispatching and mode switching.
+//
+// When the Client is in the replay phase, requested by replayPhaseRequest, events from the Event Stream API will
+// be cached until the replay phase has finished, while replayed events will be delivered directly.
+//
+// Communication takes place over the channels eventDispatcherEventStream, eventDispatcherReplay and replayPhaseRequest.
+func (client *Client) worker() {
 	// eventCache is a subset of event.Event used for caching those in replayCache below.
 	type eventCache struct {
 		SourceId int64
@@ -265,6 +308,11 @@ func (client *Client) eventDispatcher() {
 	}
 
 	var (
+		// replayFinCh is non-nil while the Client is in the replay phase. It will be closed by the producer,
+		// startReplayWorker, when replaying is finished, which signals the select below to switch phases. When this
+		// variable is nil, the Client is in the normal operating phase.
+		replayFinCh chan struct{}
+
 		// replayBuffer holds Event Stream events to be replayed after the replay phase has finished.
 		replayBuffer = make([]*event.Event, 0)
 		// replayCache maps eventCache derived from event.Events to API time to skip replaying outdated events.
@@ -283,10 +331,21 @@ func (client *Client) eventDispatcher() {
 	for {
 		select {
 		case <-client.Ctx.Done():
-			client.Logger.Warnw("Closing event dispatcher as its context is done", zap.Error(client.Ctx.Err()))
+			client.Logger.Warnw("Closing down main worker as context is finished", zap.Error(client.Ctx.Err()))
 			return
 
-		case <-client.replayTrigger:
+		case <-client.replayPhaseRequest:
+			if replayFinCh != nil {
+				// There shouldn't be multiple concurrent startReplayWorker calls. However, technically this is possible,
+				// e.g., when the Icinga 2 API connection is flapping.
+				client.Logger.Error("Another replay phase request was sent while the Client is already replaying")
+				break
+			}
+
+			client.Logger.Debug("Dispatcher enters replay phase, starting caching Event Stream events")
+			replayFinCh = client.startReplayWorker()
+
+		case <-replayFinCh:
 			skipCounter := 0
 			for _, ev := range replayBuffer {
 				ts, ok := replayCache[eventCache{ev.SourceId, ev.Name, ev.Type}]
@@ -299,26 +358,26 @@ func (client *Client) eventDispatcher() {
 
 				client.CallbackFn(ev)
 			}
-			client.Logger.Infow("Finished replay phase, returning to normal operation",
+			client.Logger.Infow("Dispatcher leaves replay phase, returning to normal operation",
 				zap.Int("cached events", len(replayBuffer)), zap.Int("skipped events", skipCounter))
 
+			replayFinCh = nil
 			replayBuffer = make([]*event.Event, 0)
 			replayCache = make(map[eventCache]time.Time)
-			client.replayPhase.Store(false)
 
 		case ev := <-client.eventDispatcherEventStream:
-			if !client.replayPhase.Load() {
-				client.CallbackFn(ev.event)
-				continue
+			if replayFinCh != nil {
+				replayBuffer = append(replayBuffer, ev.event)
+				replayCacheUpdate(ev)
+				break
 			}
 
-			replayBuffer = append(replayBuffer, ev.event)
-			replayCacheUpdate(ev)
+			client.CallbackFn(ev.event)
 
 		case ev := <-client.eventDispatcherReplay:
-			if !client.replayPhase.Load() {
-				client.Logger.Errorw("Dispatcher received replay event during normal operation", zap.Stringer("event", ev.event))
-				continue
+			if replayFinCh == nil {
+				client.Logger.Errorw("Dispatcher received replay event outside of the replay phase", zap.Stringer("event", ev.event))
+				break
 			}
 
 			client.CallbackFn(ev.event)
@@ -327,45 +386,6 @@ func (client *Client) eventDispatcher() {
 	}
 }
 
-// enterReplayPhase enters the replay phase for the initial sync and after reconnections.
-//
-// This method starts multiple goroutines. First, some workers to query the Icinga 2 Objects API will be launched. When
-// all of those have finished, the replayTrigger will be used to indicate that the buffered Events should be replayed.
-func (client *Client) enterReplayPhase() {
-	client.Logger.Info("Entering replay phase to replay stored events first")
-	if !client.replayPhase.CompareAndSwap(false, true) {
-		client.Logger.Error("The Event Stream Client is already in the replay phase")
-		return
-	}
-
-	group, groupCtx := errgroup.WithContext(client.Ctx)
-	objTypes := []string{"host", "service"}
-	for _, objType := range objTypes {
-		objType := objType // https://go.dev/doc/faq#closures_and_goroutines
-		group.Go(func() error {
-			err := client.checkMissedChanges(groupCtx, objType)
-			if err != nil {
-				client.Logger.Errorw("Replaying API events resulted in errors",
-					zap.String("object type", objType), zap.Error(err))
-			}
-			return err
-		})
-	}
-
-	go func() {
-		startTime := time.Now()
-
-		err := group.Wait()
-		if err != nil {
-			client.Logger.Errorw("Replaying the API resulted in errors", zap.Error(err), zap.Duration("duration", time.Since(startTime)))
-		} else {
-			client.Logger.Debugw("All replay phase workers have finished", zap.Duration("duration", time.Since(startTime)))
-		}
-
-		client.replayTrigger <- struct{}{}
-	}()
-}
-
 // Process incoming objects and reconnect to the Event Stream with replaying objects if necessary.
 //
 // This method blocks as long as the Client runs, which, unless its context is cancelled, is forever. While its internal
@@ -374,9 +394,9 @@ func (client *Client) enterReplayPhase() {
 func (client *Client) Process() {
 	client.eventDispatcherEventStream = make(chan *eventMsg)
 	client.eventDispatcherReplay = make(chan *eventMsg)
-	client.replayTrigger = make(chan struct{})
+	client.replayPhaseRequest = make(chan struct{})
 
-	go client.eventDispatcher()
+	go client.worker()
 
 	for {
 		err := client.listenEventStream()
diff --git a/internal/eventstream/client_api.go b/internal/eventstream/client_api.go
@@ -349,7 +349,12 @@ func (client *Client) listenEventStream() error {
 	defer cancel()
 	defer func() { _ = response.Body.Close() }()
 
-	client.enterReplayPhase()
+	select {
+	case <-client.Ctx.Done():
+		client.Logger.Warnw("Cannot request starting replay phase as context is finished", zap.Error(client.Ctx.Err()))
+		return client.Ctx.Err()
+	case client.replayPhaseRequest <- struct{}{}:
+	}
 
 	client.Logger.Info("Start listening on Icinga 2 Event Stream..")