Skip to content

Commit 26a3295

Browse files
committed
eventstream: rework Client's channels
This change started by creating two separate channels for events from the Event Stream API and from the replay phase. During debugging, I came across multiple bugs, all of which are at least somewhat addressed here. - I used buffered channels to pass events from the producers to the single dispatcher consumer, even though I should have known better. When the last replay producer has finished, another channel is used to communicate this state change. Because of my lazy buffered-channel hack, the last events were raced by the finish signal. - After restoring (channel processing) order, the producers need a way to quickly exit when the consumer has finished. Thus, a select statement for both reading and writing - checking the context's Done channel - was introduced in all producers. - Some safeguard checks were introduced, which, e.g., detected the channel race error listed above. - Somehow during a prior refactoring, the Client.fetchServiceGroups method was broken so that it queried the host groups instead of, as its name says, the service groups. - My Docker-based testing environment sends SIGTERM instead of SIGINT, which, for other reasons, did not even reach the binary. Now SIGTERM is honored for the main context as well. - Some documentation, especially regarding error messages, had either typos or grammatical mistakes.
1 parent db8f8b7 commit 26a3295

File tree

3 files changed

+60
-43
lines changed

3 files changed

+60
-43
lines changed

cmd/icinga-notifications-daemon/main.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"os"
1515
"os/signal"
1616
"runtime"
17+
"syscall"
1718
"time"
1819
)
1920

@@ -75,7 +76,7 @@ func main() {
7576
}
7677
}
7778

78-
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
79+
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
7980
defer cancel()
8081

8182
runtimeConfig := config.NewRuntimeConfig(db, logs)

internal/eventstream/client.go

+40-30
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"crypto/tls"
66
"crypto/x509"
7+
"errors"
78
"fmt"
89
"github.com/icinga/icinga-notifications/internal/config"
910
"github.com/icinga/icinga-notifications/internal/event"
@@ -21,12 +22,6 @@ import (
2122

2223
// This file contains the main resp. common methods for the Client.
2324

24-
// outgoingEvent is a wrapper around an event.Event and its producer's origin to be sent to the eventDispatcher.
25-
type outgoingEvent struct {
26-
event *event.Event
27-
fromEventStream bool
28-
}
29-
3025
// Client for the Icinga 2 Event Stream API with extended support for other Icinga 2 APIs to gather additional
3126
// information and allow a replay in case of a connection loss.
3227
type Client struct {
@@ -48,8 +43,11 @@ type Client struct {
4843
// Logger to log to.
4944
Logger *logging.Logger
5045

51-
// eventDispatch communicates Events to be processed between producer and consumer.
52-
eventDispatch chan *outgoingEvent
46+
// eventDispatcherEventStream communicates Events to be processed from the Event Stream API.
47+
eventDispatcherEventStream chan *event.Event
48+
// eventDispatcherReplay communicates Events to be processed from the Icinga 2 API replay during replay phase.
49+
eventDispatcherReplay chan *event.Event
50+
5351
// replayTrigger signals the eventDispatcher method that the replay phase is finished.
5452
replayTrigger chan struct{}
5553
// replayPhase indicates that Events will be cached as the Event Stream Client is in the replay phase.
@@ -192,22 +190,22 @@ func (client *Client) buildHostServiceEvent(result CheckResult, state int, host,
192190

193191
if service != "" {
194192
switch state {
195-
case 0:
193+
case 0: // OK
196194
eventSeverity = event.SeverityOK
197-
case 1:
195+
case 1: // WARNING
198196
eventSeverity = event.SeverityWarning
199-
case 2:
197+
case 2: // CRITICAL
200198
eventSeverity = event.SeverityCrit
201-
default:
199+
default: // UNKNOWN or faulty
202200
eventSeverity = event.SeverityErr
203201
}
204202
} else {
205203
switch state {
206-
case 0:
204+
case 0: // UP
207205
eventSeverity = event.SeverityOK
208-
case 1:
206+
case 1: // DOWN
209207
eventSeverity = event.SeverityCrit
210-
default:
208+
default: // faulty
211209
eventSeverity = event.SeverityErr
212210
}
213211
}
@@ -248,7 +246,7 @@ func (client *Client) eventDispatcher() {
248246
for {
249247
select {
250248
case <-client.Ctx.Done():
251-
client.Logger.Warnw("Closing event dispatcher as context is done", zap.Error(client.Ctx.Err()))
249+
client.Logger.Warnw("Closing event dispatcher as its context is done", zap.Error(client.Ctx.Err()))
252250
return
253251

254252
case <-client.replayTrigger:
@@ -259,12 +257,18 @@ func (client *Client) eventDispatcher() {
259257
client.replayPhase.Store(false)
260258
replayBuffer = []*event.Event{}
261259

262-
case ev := <-client.eventDispatch:
263-
if client.replayPhase.Load() && ev.fromEventStream {
264-
replayBuffer = append(replayBuffer, ev.event)
260+
case ev := <-client.eventDispatcherEventStream:
261+
if client.replayPhase.Load() {
262+
replayBuffer = append(replayBuffer, ev)
265263
} else {
266-
client.CallbackFn(ev.event)
264+
client.CallbackFn(ev)
265+
}
266+
267+
case ev := <-client.eventDispatcherReplay:
268+
if !client.replayPhase.Load() {
269+
client.Logger.Errorw("Dispatcher received replay event during normal operation", zap.Stringer("event", ev))
267270
}
271+
client.CallbackFn(ev)
268272
}
269273
}
270274
}
@@ -275,7 +279,10 @@ func (client *Client) eventDispatcher() {
275279
// all of those have finished, the replayTrigger will be used to indicate that the buffered Events should be replayed.
276280
func (client *Client) enterReplayPhase() {
277281
client.Logger.Info("Entering replay phase to replay stored events first")
278-
client.replayPhase.Store(true)
282+
if !client.replayPhase.CompareAndSwap(false, true) {
283+
client.Logger.Error("The Event Stream Client is already in the replay phase")
284+
return
285+
}
279286

280287
queryFns := []func(string){client.checkMissedAcknowledgements, client.checkMissedStateChanges}
281288
objTypes := []string{"host", "service"}
@@ -293,7 +300,9 @@ func (client *Client) enterReplayPhase() {
293300
}
294301

295302
go func() {
303+
startTime := time.Now()
296304
replayWg.Wait()
305+
client.Logger.Debugw("All replay phase workers have finished", zap.Duration("duration", time.Since(startTime)))
297306
client.replayTrigger <- struct{}{}
298307
}()
299308
}
@@ -304,22 +313,23 @@ func (client *Client) enterReplayPhase() {
304313
// loop takes care of reconnections, all those events will be logged while generated Events will be dispatched to the
305314
// callback function.
306315
func (client *Client) Process() {
307-
// These two channels will be used to communicate the Events and are crucial. As there are multiple producers and
308-
// only one consumer, eventDispatcher, there is no ideal closer. However, producers and the consumer will be
309-
// finished by the Client's context. When this happens, the main application should either be stopped or the Client
310-
// is restarted, and we can hope for the GC. To make sure that nothing gets stuck, make the event channel buffered.
311-
client.eventDispatch = make(chan *outgoingEvent, 1024)
316+
client.eventDispatcherEventStream = make(chan *event.Event)
317+
client.eventDispatcherReplay = make(chan *event.Event)
312318
client.replayTrigger = make(chan struct{})
313319

314-
defer client.Logger.Info("Event Stream Client has stopped")
315-
316320
go client.eventDispatcher()
317321

318322
for {
319323
err := client.listenEventStream()
320-
if err != nil {
324+
switch {
325+
case errors.Is(err, context.Canceled):
326+
client.Logger.Warnw("Stopping Event Stream Client as its context is done", zap.Error(err))
327+
return
328+
329+
case err != nil:
321330
client.Logger.Errorw("Event Stream processing failed", zap.Error(err))
322-
} else {
331+
332+
default:
323333
client.Logger.Warn("Event Stream closed stream; maybe Icinga 2 is reloading")
324334
}
325335
}

internal/eventstream/client_api.go

+18-12
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func (client *Client) queryObjectsApiQuery(objType string, query map[string]any)
9494
})
9595
}
9696

97-
// fetchHostGroup fetches all Host Groups for this host.
97+
// fetchHostGroups fetches all Host Groups for this host.
9898
func (client *Client) fetchHostGroups(host string) ([]string, error) {
9999
jsonRaw, err := client.queryObjectsApiDirect("host", host)
100100
if err != nil {
@@ -114,7 +114,7 @@ func (client *Client) fetchHostGroups(host string) ([]string, error) {
114114

115115
// fetchServiceGroups fetches all Service Groups for this service on this host.
116116
func (client *Client) fetchServiceGroups(host, service string) ([]string, error) {
117-
jsonRaw, err := client.queryObjectsApiDirect("host", host)
117+
jsonRaw, err := client.queryObjectsApiDirect("service", host+"!"+service)
118118
if err != nil {
119119
return nil, err
120120
}
@@ -231,15 +231,18 @@ func (client *Client) checkMissedChanges(objType, filterExpr string, attrsCallba
231231
// checkMissedStateChanges fetches all objects of the requested type and feeds them into the handler.
232232
func (client *Client) checkMissedStateChanges(objType string) {
233233
client.checkMissedChanges(objType, "", func(attrs HostServiceRuntimeAttributes, host, service string) {
234+
logger := client.Logger.With(zap.String("object type", objType))
235+
234236
ev, err := client.buildHostServiceEvent(attrs.LastCheckResult, attrs.State, host, service)
235237
if err != nil {
236-
client.Logger.Errorw("Failed to construct Event from API", zap.String("object type", objType), zap.Error(err))
238+
logger.Errorw("Failed to construct Event from API", zap.Error(err))
237239
return
238240
}
239241

240-
client.eventDispatch <- &outgoingEvent{
241-
event: ev,
242-
fromEventStream: false,
242+
select {
243+
case <-client.Ctx.Done():
244+
logger.Warnw("Cannot dispatch replayed event as context is finished", zap.Error(client.Ctx.Err()))
245+
case client.eventDispatcherReplay <- ev:
243246
}
244247
})
245248
}
@@ -264,9 +267,10 @@ func (client *Client) checkMissedAcknowledgements(objType string) {
264267
return
265268
}
266269

267-
client.eventDispatch <- &outgoingEvent{
268-
event: ev,
269-
fromEventStream: false,
270+
select {
271+
case <-client.Ctx.Done():
272+
logger.Warnw("Cannot dispatch replayed event as context is finished", zap.Error(client.Ctx.Err()))
273+
case client.eventDispatcherReplay <- ev:
270274
}
271275
})
272276
}
@@ -356,9 +360,11 @@ func (client *Client) listenEventStream() error {
356360
return err
357361
}
358362

359-
client.eventDispatch <- &outgoingEvent{
360-
event: ev,
361-
fromEventStream: true,
363+
select {
364+
case <-client.Ctx.Done():
365+
client.Logger.Warnw("Cannot dispatch Event Stream event as context is finished", zap.Error(client.Ctx.Err()))
366+
return client.Ctx.Err()
367+
case client.eventDispatcherEventStream <- ev:
362368
}
363369
}
364370
return lineScanner.Err()

0 commit comments

Comments
 (0)