Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix error when the operator metrics ServiceMonitor already exists #3447

Merged
merged 3 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .chloggen/3446.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: 'bug_fix'

# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
component: operator

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: The operator pod crashed if the ServiceMonitor for the operator metrics had already been created by another operator pod.

# One or more tracking issues related to the change
issues: [3446]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
The operator failed when its pod was restarted and the ServiceMonitor for the operator metrics had already been created by another operator pod.
To fix this, the operator now sets the owner reference on the Service Monitor to itself and checks if the Service Monitor already exists.

51 changes: 48 additions & 3 deletions internal/operator-metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ import (
"fmt"
"os"

"github.com/go-logr/logr"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -51,26 +54,35 @@ var _ manager.Runnable = &OperatorMetrics{}

type OperatorMetrics struct {
kubeClient client.Client
log logr.Logger
}

func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme) (OperatorMetrics, error) {
func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme, log logr.Logger) (OperatorMetrics, error) {
kubeClient, err := client.New(config, client.Options{Scheme: scheme})
if err != nil {
return OperatorMetrics{}, err
}

return OperatorMetrics{
kubeClient: kubeClient,
log: log,
}, nil
}

func (om OperatorMetrics) Start(ctx context.Context) error {
rawNamespace, err := os.ReadFile(namespaceFile)
if err != nil {
return fmt.Errorf("error reading namespace file: %w", err)
om.log.Error(err, "error reading namespace file. No ServiceMonitor will be created")
return nil
}
namespace := string(rawNamespace)

ownerRef, err := om.getOwnerReferences(ctx, namespace)
if err != nil {
om.log.Error(err, "error getting owner references. No ServiceMonitor will be created")
return nil
}

var tlsConfig *monitoringv1.TLSConfig

if om.caConfigMapExists() {
Expand Down Expand Up @@ -101,6 +113,7 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
"app.kubernetes.io/part-of": "opentelemetry-operator",
"control-plane": "controller-manager",
},
OwnerReferences: []metav1.OwnerReference{ownerRef},
},
Spec: monitoringv1.ServiceMonitorSpec{
Selector: metav1.LabelSelector{
Expand All @@ -123,7 +136,8 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
}

err = om.kubeClient.Create(ctx, &sm)
if err != nil {
// The ServiceMonitor can be already there if this is a restart
if err != nil && !apierrors.IsAlreadyExists(err) {
return fmt.Errorf("error creating service monitor: %w", err)
}

Expand All @@ -143,3 +157,34 @@ func (om OperatorMetrics) caConfigMapExists() bool {
}, &corev1.ConfigMap{},
) == nil
}

// getOwnerReferences looks up the operator's own Deployment in the given
// namespace and builds an OwnerReference pointing at it, so that resources
// created by the operator (e.g. the metrics ServiceMonitor) are garbage
// collected together with the Deployment. It returns an error when the
// Deployment cannot be listed or no Deployment matches the operator labels.
func (om OperatorMetrics) getOwnerReferences(ctx context.Context, namespace string) (metav1.OwnerReference, error) {
	var deployments appsv1.DeploymentList

	// Select the operator Deployment by its well-known labels.
	opts := []client.ListOption{
		client.InNamespace(namespace),
		client.MatchingLabels(map[string]string{
			"app.kubernetes.io/name": "opentelemetry-operator",
			"control-plane":          "controller-manager",
		}),
	}

	if err := om.kubeClient.List(ctx, &deployments, opts...); err != nil {
		return metav1.OwnerReference{}, err
	}
	if len(deployments.Items) == 0 {
		return metav1.OwnerReference{}, fmt.Errorf("no deployments found with the specified label")
	}

	// Use the first match as the owner; normally there is exactly one.
	owner := &deployments.Items[0]
	return metav1.OwnerReference{
		APIVersion: "apps/v1",
		Kind:       "Deployment",
		Name:       owner.Name,
		UID:        owner.UID,
	}, nil
}
86 changes: 80 additions & 6 deletions internal/operator-metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,30 @@ package operatormetrics
import (
"context"
"os"
"reflect"
"testing"
"time"

"github.com/go-logr/logr"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestNewOperatorMetrics(t *testing.T) {
config := &rest.Config{}
scheme := runtime.NewScheme()
metrics, err := NewOperatorMetrics(config, scheme)
metrics, err := NewOperatorMetrics(config, scheme, logr.Discard())
assert.NoError(t, err)
assert.NotNil(t, metrics.kubeClient)
}
Expand All @@ -53,12 +57,15 @@ func TestOperatorMetrics_Start(t *testing.T) {
namespaceFile = tmpFile.Name()

scheme := runtime.NewScheme()
err = corev1.AddToScheme(scheme)
require.NoError(t, err)
err = monitoringv1.AddToScheme(scheme)
require.NoError(t, err)
require.NoError(t, corev1.AddToScheme(scheme))
require.NoError(t, appsv1.AddToScheme(scheme))
require.NoError(t, monitoringv1.AddToScheme(scheme))

client := fake.NewClientBuilder().WithScheme(scheme).Build()
client := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(
&appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{Name: "opentelemetry-operator", Namespace: "test-namespace", Labels: map[string]string{"app.kubernetes.io/name": "opentelemetry-operator", "control-plane": "controller-manager"}},
},
).Build()

metrics := OperatorMetrics{kubeClient: client}

Expand Down Expand Up @@ -125,3 +132,70 @@ func TestOperatorMetrics_caConfigMapExists(t *testing.T) {
metricsWithoutConfigMap := OperatorMetrics{kubeClient: clientWithoutConfigMap}
assert.False(t, metricsWithoutConfigMap.caConfigMapExists())
}

// TestOperatorMetrics_getOwnerReferences verifies that getOwnerReferences
// resolves the operator Deployment into the expected OwnerReference, and
// that it reports an error when no matching Deployment exists.
func TestOperatorMetrics_getOwnerReferences(t *testing.T) {
	cases := []struct {
		name      string
		namespace string
		objects   []client.Object
		want      metav1.OwnerReference
		wantErr   bool
	}{
		{
			name:      "successful owner reference retrieval",
			namespace: "test-namespace",
			objects: []client.Object{
				&appsv1.Deployment{
					ObjectMeta: metav1.ObjectMeta{
						Name:      "opentelemetry-operator",
						Namespace: "test-namespace",
						UID:       "test-uid",
						Labels: map[string]string{
							"app.kubernetes.io/name": "opentelemetry-operator",
							"control-plane":          "controller-manager",
						},
					},
				},
			},
			want: metav1.OwnerReference{
				APIVersion: "apps/v1",
				Kind:       "Deployment",
				Name:       "opentelemetry-operator",
				UID:        "test-uid",
			},
			wantErr: false,
		},
		{
			name:      "no deployments found",
			namespace: "test-namespace",
			objects:   []client.Object{},
			want:      metav1.OwnerReference{},
			wantErr:   true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Build a fake client pre-seeded with the test's objects.
			scheme := runtime.NewScheme()
			_ = appsv1.AddToScheme(scheme)

			om := OperatorMetrics{
				kubeClient: fake.NewClientBuilder().
					WithScheme(scheme).
					WithObjects(tc.objects...).
					Build(),
				log: logr.Discard(),
			}

			got, err := om.getOwnerReferences(context.Background(), tc.namespace)
			if (err != nil) != tc.wantErr {
				t.Errorf("getOwnerReferences() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("getOwnerReferences() got = %v, want %v", got, tc.want)
			}
		})
	}
}
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ func main() {
}

if cfg.PrometheusCRAvailability() == prometheus.Available {
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme)
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme, ctrl.Log.WithName("operator-metrics-sm"))
if opError != nil {
setupLog.Error(opError, "Failed to create the operator metrics SM")
}
Expand Down
Loading