@@ -15,6 +15,9 @@ import (
15
15
"github.com/nais/api/internal/leaderelection"
16
16
"github.com/sirupsen/logrus"
17
17
"github.com/sourcegraph/conc/pool"
18
+ "go.opentelemetry.io/otel"
19
+ "go.opentelemetry.io/otel/attribute"
20
+ "go.opentelemetry.io/otel/metric"
18
21
eventv1 "k8s.io/api/events/v1"
19
22
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
20
23
"k8s.io/apimachinery/pkg/watch"
@@ -29,21 +32,37 @@ type Watcher struct {
29
32
wg * pool.ContextPool
30
33
31
34
// state holds one channel per client; a true value signals that the watcher should start or keep running, and false that it should stop.
32
- state []chan bool
35
+ state []chan bool
36
+ eventsCounter metric.Int64Counter
37
+ handlersCounter metric.Int64UpDownCounter
33
38
}
34
39
35
- func NewWatcher (pool * pgxpool.Pool , clients map [string ]kubernetes.Interface , log logrus.FieldLogger ) * Watcher {
40
+ func NewWatcher (pool * pgxpool.Pool , clients map [string ]kubernetes.Interface , log logrus.FieldLogger ) ( * Watcher , error ) {
36
41
chs := make ([]chan bool , 0 , len (clients ))
37
42
for range clients {
38
43
chs = append (chs , make (chan bool , 1 ))
39
44
}
40
- return & Watcher {
41
- clients : clients ,
42
- events : make (chan eventsql.UpsertParams , 20 ),
43
- queries : eventsql .New (pool ),
44
- log : log ,
45
- state : chs ,
45
+
46
+ meter := otel .GetMeterProvider ().Meter ("nais_api_k8s_events" )
47
+ eventsCounter , err := meter .Int64Counter ("nais_api_k8s_events_total" , metric .WithDescription ("Number of events processed" ))
48
+ if err != nil {
49
+ return nil , fmt .Errorf ("creating events counter: %w" , err )
46
50
}
51
+
52
+ handlersCounter , err := meter .Int64UpDownCounter ("nais_api_k8s_handlers" , metric .WithDescription ("number of goroutines handling events" ))
53
+ if err != nil {
54
+ return nil , fmt .Errorf ("creating handlers counter: %w" , err )
55
+ }
56
+
57
+ return & Watcher {
58
+ clients : clients ,
59
+ events : make (chan eventsql.UpsertParams , 20 ),
60
+ queries : eventsql .New (pool ),
61
+ log : log ,
62
+ state : chs ,
63
+ eventsCounter : eventsCounter ,
64
+ handlersCounter : handlersCounter ,
65
+ }, nil
47
66
}
48
67
49
68
func (w * Watcher ) Run (ctx context.Context ) {
@@ -114,6 +133,9 @@ func (w *Watcher) run(ctx context.Context, env string, client kubernetes.Interfa
114
133
}
115
134
116
135
func (w * Watcher ) watch (ctx context.Context , env string , client kubernetes.Interface , state chan bool ) error {
136
+ w .handlersCounter .Add (ctx , 1 , metric .WithAttributes (attribute .String ("environment" , env )))
137
+ defer w .handlersCounter .Add (ctx , - 1 , metric .WithAttributes (attribute .String ("environment" , env )))
138
+
117
139
// Events we want to watch for
118
140
// SuccessfulRescale - Check for successful rescale events
119
141
// Killing - Check for liveness failures
@@ -125,21 +147,28 @@ func (w *Watcher) watch(ctx context.Context, env string, client kubernetes.Inter
125
147
126
148
w .log .WithField ("len" , len (list .Items )).Debug ("listed events" )
127
149
150
+ closeAndDrain := func (w watch.Interface ) {
151
+ w .Stop ()
152
+ for range w .ResultChan () {
153
+ // Drain the channel
154
+ }
155
+ }
156
+
128
157
rescale , err := client .EventsV1 ().Events ("" ).Watch (ctx , metav1.ListOptions {
129
158
FieldSelector : "reason=SuccessfulRescale,metadata.namespace!=nais-system" ,
130
159
})
131
160
if err != nil {
132
161
return fmt .Errorf ("failed to watch for rescale events: %w" , err )
133
162
}
134
- defer rescale . Stop ( )
163
+ defer closeAndDrain ( rescale )
135
164
136
165
killing , err := client .EventsV1 ().Events ("" ).Watch (ctx , metav1.ListOptions {
137
166
FieldSelector : "reason=Killing,metadata.namespace!=nais-system" ,
138
167
})
139
168
if err != nil {
140
169
return fmt .Errorf ("failed to watch for killing events: %w" , err )
141
170
}
142
- defer killing . Stop ( )
171
+ defer closeAndDrain ( killing )
143
172
144
173
handleEvent := func (event watch.Event , convert func (e * eventv1.Event ) (eventsql.UpsertParams , bool )) {
145
174
if event .Type != watch .Added && event .Type != watch .Modified {
@@ -174,6 +203,12 @@ func (w *Watcher) watch(ctx context.Context, env string, client kubernetes.Inter
174
203
return nil
175
204
}
176
205
case event := <- rescale .ResultChan ():
206
+ w .eventsCounter .Add (ctx , 1 , metric .WithAttributes (
207
+ attribute .String ("environment" , string (env )),
208
+ attribute .String ("type" , string (event .Type )),
209
+ attribute .String ("reason" , "SuccessfulRescale" )),
210
+ )
211
+
177
212
handleEvent (event , func (e * eventv1.Event ) (eventsql.UpsertParams , bool ) {
178
213
if ! strings .HasPrefix (e .Note , "New size" ) {
179
214
w .log .WithField ("note" , e .Note ).Debug ("ignoring event" )
@@ -203,6 +238,12 @@ func (w *Watcher) watch(ctx context.Context, env string, client kubernetes.Inter
203
238
return w .toUpsertParams (env , e , data )
204
239
})
205
240
case event := <- killing .ResultChan ():
241
+ w .eventsCounter .Add (ctx , 1 , metric .WithAttributes (
242
+ attribute .String ("environment" , string (env )),
243
+ attribute .String ("type" , string (event .Type )),
244
+ attribute .String ("reason" , "Killing" )),
245
+ )
246
+
206
247
handleEvent (event , func (e * eventv1.Event ) (eventsql.UpsertParams , bool ) {
207
248
if strings .HasSuffix (e .Note , "failed liveness probe, will be restarted" ) {
208
249
// Match `Container some-container-name failed liveness probe, will be restarted`
0 commit comments