Skip to content

Commit 0aab12f

Browse files
Israel Blancasswiatekm
Israel Blancas
authored andcommitted
Fix error when the operator metrics ServiceMonitor already exists (open-telemetry#3447)
* Fix error when the operator metrics ServiceMonitor already exists Signed-off-by: Israel Blancas <[email protected]> * Wrap the errors and log them Signed-off-by: Israel Blancas <[email protected]> --------- Signed-off-by: Israel Blancas <[email protected]> (cherry picked from commit 917b096)
1 parent 99b6c6f commit 0aab12f

File tree

4 files changed

+167
-22
lines changed

4 files changed

+167
-22
lines changed

.chloggen/3446.yaml

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
2+
change_type: 'bug_fix'
3+
4+
# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
5+
component: operator
6+
7+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
8+
note: Operator pod crashed if the Service Monitor for the operator metrics was created before by another operator pod.
9+
10+
# One or more tracking issues related to the change
11+
issues: [3446]
12+
13+
# (Optional) One or more lines of additional information to render under the primary note.
14+
# These lines will be padded with 2 spaces and then inserted directly into the document.
15+
# Use pipe (|) for multiline entries.
16+
subtext: |
17+
Operator fails when the pod is restarted and the Service Monitor for operator metrics was already created by another operator pod.
18+
To fix this, the operator now sets the owner reference on the Service Monitor to itself and checks if the Service Monitor already exists.
19+

internal/operator-metrics/metrics.go

+67-15
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,11 @@ import (
1919
"fmt"
2020
"os"
2121

22+
"github.com/go-logr/logr"
2223
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
24+
appsv1 "k8s.io/api/apps/v1"
2325
corev1 "k8s.io/api/core/v1"
26+
apierrors "k8s.io/apimachinery/pkg/api/errors"
2427
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2528
"k8s.io/apimachinery/pkg/runtime"
2629
"k8s.io/apimachinery/pkg/util/intstr"
@@ -51,26 +54,85 @@ var _ manager.Runnable = &OperatorMetrics{}
5154

5255
type OperatorMetrics struct {
5356
kubeClient client.Client
57+
log logr.Logger
5458
}
5559

56-
func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme) (OperatorMetrics, error) {
60+
func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme, log logr.Logger) (OperatorMetrics, error) {
5761
kubeClient, err := client.New(config, client.Options{Scheme: scheme})
5862
if err != nil {
5963
return OperatorMetrics{}, err
6064
}
6165

6266
return OperatorMetrics{
6367
kubeClient: kubeClient,
68+
log: log,
6469
}, nil
6570
}
6671

6772
func (om OperatorMetrics) Start(ctx context.Context) error {
73+
err := om.createOperatorMetricsServiceMonitor(ctx)
74+
if err != nil {
75+
om.log.Error(err, "error creating Service Monitor for operator metrics")
76+
}
77+
78+
return nil
79+
}
80+
81+
func (om OperatorMetrics) NeedLeaderElection() bool {
82+
return true
83+
}
84+
85+
func (om OperatorMetrics) caConfigMapExists() bool {
86+
return om.kubeClient.Get(context.Background(), client.ObjectKey{
87+
Name: caBundleConfigMap,
88+
Namespace: openshiftInClusterMonitoringNamespace,
89+
}, &corev1.ConfigMap{},
90+
) == nil
91+
}
92+
93+
func (om OperatorMetrics) getOwnerReferences(ctx context.Context, namespace string) (metav1.OwnerReference, error) {
94+
var deploymentList appsv1.DeploymentList
95+
96+
listOptions := []client.ListOption{
97+
client.InNamespace(namespace),
98+
client.MatchingLabels(map[string]string{
99+
"app.kubernetes.io/name": "opentelemetry-operator",
100+
"control-plane": "controller-manager",
101+
}),
102+
}
103+
104+
err := om.kubeClient.List(ctx, &deploymentList, listOptions...)
105+
if err != nil {
106+
return metav1.OwnerReference{}, err
107+
}
108+
109+
if len(deploymentList.Items) == 0 {
110+
return metav1.OwnerReference{}, fmt.Errorf("no deployments found with the specified label")
111+
}
112+
deployment := &deploymentList.Items[0]
113+
114+
ownerRef := metav1.OwnerReference{
115+
APIVersion: "apps/v1",
116+
Kind: "Deployment",
117+
Name: deployment.Name,
118+
UID: deployment.UID,
119+
}
120+
121+
return ownerRef, nil
122+
}
123+
124+
func (om OperatorMetrics) createOperatorMetricsServiceMonitor(ctx context.Context) error {
68125
rawNamespace, err := os.ReadFile(namespaceFile)
69126
if err != nil {
70127
return fmt.Errorf("error reading namespace file: %w", err)
71128
}
72129
namespace := string(rawNamespace)
73130

131+
ownerRef, err := om.getOwnerReferences(ctx, namespace)
132+
if err != nil {
133+
return fmt.Errorf("error getting owner references: %w", err)
134+
}
135+
74136
var tlsConfig *monitoringv1.TLSConfig
75137

76138
if om.caConfigMapExists() {
@@ -101,6 +163,7 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
101163
"app.kubernetes.io/part-of": "opentelemetry-operator",
102164
"control-plane": "controller-manager",
103165
},
166+
OwnerReferences: []metav1.OwnerReference{ownerRef},
104167
},
105168
Spec: monitoringv1.ServiceMonitorSpec{
106169
Selector: metav1.LabelSelector{
@@ -123,23 +186,12 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
123186
}
124187

125188
err = om.kubeClient.Create(ctx, &sm)
126-
if err != nil {
127-
return fmt.Errorf("error creating service monitor: %w", err)
189+
// The ServiceMonitor can be already there if this is a restart
190+
if err != nil && !apierrors.IsAlreadyExists(err) {
191+
return err
128192
}
129193

130194
<-ctx.Done()
131195

132196
return om.kubeClient.Delete(ctx, &sm)
133197
}
134-
135-
func (om OperatorMetrics) NeedLeaderElection() bool {
136-
return true
137-
}
138-
139-
func (om OperatorMetrics) caConfigMapExists() bool {
140-
return om.kubeClient.Get(context.Background(), client.ObjectKey{
141-
Name: caBundleConfigMap,
142-
Namespace: openshiftInClusterMonitoringNamespace,
143-
}, &corev1.ConfigMap{},
144-
) == nil
145-
}

internal/operator-metrics/metrics_test.go

+80-6
Original file line numberDiff line numberDiff line change
@@ -17,26 +17,30 @@ package operatormetrics
1717
import (
1818
"context"
1919
"os"
20+
"reflect"
2021
"testing"
2122
"time"
2223

24+
"github.com/go-logr/logr"
2325
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
2426
"github.com/stretchr/testify/assert"
2527
"github.com/stretchr/testify/require"
28+
appsv1 "k8s.io/api/apps/v1"
2629
corev1 "k8s.io/api/core/v1"
2730
apierrors "k8s.io/apimachinery/pkg/api/errors"
2831
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2932
"k8s.io/apimachinery/pkg/runtime"
3033
"k8s.io/apimachinery/pkg/types"
3134
"k8s.io/apimachinery/pkg/util/wait"
3235
"k8s.io/client-go/rest"
36+
"sigs.k8s.io/controller-runtime/pkg/client"
3337
"sigs.k8s.io/controller-runtime/pkg/client/fake"
3438
)
3539

3640
func TestNewOperatorMetrics(t *testing.T) {
3741
config := &rest.Config{}
3842
scheme := runtime.NewScheme()
39-
metrics, err := NewOperatorMetrics(config, scheme)
43+
metrics, err := NewOperatorMetrics(config, scheme, logr.Discard())
4044
assert.NoError(t, err)
4145
assert.NotNil(t, metrics.kubeClient)
4246
}
@@ -53,12 +57,15 @@ func TestOperatorMetrics_Start(t *testing.T) {
5357
namespaceFile = tmpFile.Name()
5458

5559
scheme := runtime.NewScheme()
56-
err = corev1.AddToScheme(scheme)
57-
require.NoError(t, err)
58-
err = monitoringv1.AddToScheme(scheme)
59-
require.NoError(t, err)
60+
require.NoError(t, corev1.AddToScheme(scheme))
61+
require.NoError(t, appsv1.AddToScheme(scheme))
62+
require.NoError(t, monitoringv1.AddToScheme(scheme))
6063

61-
client := fake.NewClientBuilder().WithScheme(scheme).Build()
64+
client := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(
65+
&appsv1.Deployment{
66+
ObjectMeta: metav1.ObjectMeta{Name: "opentelemetry-operator", Namespace: "test-namespace", Labels: map[string]string{"app.kubernetes.io/name": "opentelemetry-operator", "control-plane": "controller-manager"}},
67+
},
68+
).Build()
6269

6370
metrics := OperatorMetrics{kubeClient: client}
6471

@@ -125,3 +132,70 @@ func TestOperatorMetrics_caConfigMapExists(t *testing.T) {
125132
metricsWithoutConfigMap := OperatorMetrics{kubeClient: clientWithoutConfigMap}
126133
assert.False(t, metricsWithoutConfigMap.caConfigMapExists())
127134
}
135+
136+
func TestOperatorMetrics_getOwnerReferences(t *testing.T) {
137+
tests := []struct {
138+
name string
139+
namespace string
140+
objects []client.Object
141+
want metav1.OwnerReference
142+
wantErr bool
143+
}{
144+
{
145+
name: "successful owner reference retrieval",
146+
namespace: "test-namespace",
147+
objects: []client.Object{
148+
&appsv1.Deployment{
149+
ObjectMeta: metav1.ObjectMeta{
150+
Name: "opentelemetry-operator",
151+
Namespace: "test-namespace",
152+
UID: "test-uid",
153+
Labels: map[string]string{
154+
"app.kubernetes.io/name": "opentelemetry-operator",
155+
"control-plane": "controller-manager",
156+
},
157+
},
158+
},
159+
},
160+
want: metav1.OwnerReference{
161+
APIVersion: "apps/v1",
162+
Kind: "Deployment",
163+
Name: "opentelemetry-operator",
164+
UID: "test-uid",
165+
},
166+
wantErr: false,
167+
},
168+
{
169+
name: "no deployments found",
170+
namespace: "test-namespace",
171+
objects: []client.Object{},
172+
want: metav1.OwnerReference{},
173+
wantErr: true,
174+
},
175+
}
176+
177+
for _, tt := range tests {
178+
t.Run(tt.name, func(t *testing.T) {
179+
scheme := runtime.NewScheme()
180+
_ = appsv1.AddToScheme(scheme)
181+
fakeClient := fake.NewClientBuilder().
182+
WithScheme(scheme).
183+
WithObjects(tt.objects...).
184+
Build()
185+
186+
om := OperatorMetrics{
187+
kubeClient: fakeClient,
188+
log: logr.Discard(),
189+
}
190+
191+
got, err := om.getOwnerReferences(context.Background(), tt.namespace)
192+
if (err != nil) != tt.wantErr {
193+
t.Errorf("getOwnerReferences() error = %v, wantErr %v", err, tt.wantErr)
194+
return
195+
}
196+
if !reflect.DeepEqual(got, tt.want) {
197+
t.Errorf("getOwnerReferences() got = %v, want %v", got, tt.want)
198+
}
199+
})
200+
}
201+
}

main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ func main() {
424424
}
425425

426426
if cfg.PrometheusCRAvailability() == prometheus.Available {
427-
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme)
427+
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme, ctrl.Log.WithName("operator-metrics-sm"))
428428
if opError != nil {
429429
setupLog.Error(opError, "Failed to create the operator metrics SM")
430430
}

0 commit comments

Comments
 (0)