From e3e38ebf95d29d0b3259962cf8e16e4cd650f358 Mon Sep 17 00:00:00 2001
From: itsomri
Date: Thu, 25 Dec 2025 15:29:11 +0200
Subject: [PATCH] Added the option to disable service monitor creation

---
 CHANGELOG.md                                  |   3 +
 .../operands/prometheus/prometheus_test.go    | 129 ++++++++++++++++++
 pkg/operator/operands/prometheus/resources.go |   6 +
 3 files changed, 138 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7dcbc8d30..1a3c92cae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ## [Unreleased]
 
+### Added
+- Added the option to disable Prometheus ServiceMonitor creation [#810](https://github.com/NVIDIA/KAI-Scheduler/pull/810) [itsomri](https://github.com/itsomri)
+
 ## [v0.12.0] - 2025-12-24
 
 ### Added
diff --git a/pkg/operator/operands/prometheus/prometheus_test.go b/pkg/operator/operands/prometheus/prometheus_test.go
index 8b844a4c8..0054a0f7b 100644
--- a/pkg/operator/operands/prometheus/prometheus_test.go
+++ b/pkg/operator/operands/prometheus/prometheus_test.go
@@ -955,6 +955,135 @@ var _ = Describe("External Prometheus Validation", func() {
 	})
 })
 
+var _ = Describe("serviceMonitorsForKAIConfig", func() {
+	var (
+		fakeKubeClient client.Client
+		kaiConfig      *kaiv1.Config
+	)
+
+	BeforeEach(func(ctx context.Context) {
+		fakeKubeClient = createFakeClientWithScheme()
+		kaiConfig = kaiConfigForPrometheus()
+	})
+
+	Context("when ServiceMonitors are enabled by default", func() {
+		BeforeEach(func(ctx context.Context) {
+			Expect(fakeKubeClient.Create(ctx, getServiceMonitorCRD())).To(Succeed())
+		})
+
+		It("should create ServiceMonitors when Enabled is not explicitly set (nil)", func(ctx context.Context) {
+			// Clear Enabled explicitly so this case does not depend on what the fixture sets
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Enabled = nil
+
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).NotTo(BeEmpty(), "Expected ServiceMonitors to be created by default")
+
+			// Verify we have ServiceMonitor objects
+			serviceMonitor := test_utils.FindTypeInObjects[*monitoringv1.ServiceMonitor](objects)
+			Expect(serviceMonitor).NotTo(BeNil(), "Expected at least one ServiceMonitor")
+		})
+
+		It("should create ServiceMonitors when Enabled is explicitly set to true", func(ctx context.Context) {
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Enabled = ptr.To(true)
+
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).NotTo(BeEmpty(), "Expected ServiceMonitors to be created when enabled")
+
+			// Verify we have ServiceMonitor objects
+			serviceMonitor := test_utils.FindTypeInObjects[*monitoringv1.ServiceMonitor](objects)
+			Expect(serviceMonitor).NotTo(BeNil(), "Expected at least one ServiceMonitor")
+		})
+
+		It("should create ServiceMonitors with correct labels", func(ctx context.Context) {
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Enabled = ptr.To(true)
+
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).NotTo(BeEmpty())
+
+			// Verify all ServiceMonitors have the correct accounting label
+			for _, obj := range objects {
+				if sm, ok := obj.(*monitoringv1.ServiceMonitor); ok {
+					Expect(sm.Labels).To(HaveKeyWithValue(serviceMonitorAccountingLabel, serviceMonitorAccountingValue))
+				}
+			}
+		})
+	})
+
+	Context("when ServiceMonitors are explicitly disabled", func() {
+		BeforeEach(func(ctx context.Context) {
+			Expect(fakeKubeClient.Create(ctx, getServiceMonitorCRD())).To(Succeed())
+		})
+
+		It("should NOT create ServiceMonitors when Enabled is set to false", func(ctx context.Context) {
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Enabled = ptr.To(false)
+
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).To(BeEmpty(), "Expected no ServiceMonitors to be created when disabled")
+		})
+
+		It("should skip creation even when other ServiceMonitor settings are configured", func(ctx context.Context) {
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Enabled = ptr.To(false)
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Interval = ptr.To("60s")
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).To(BeEmpty())
+		})
+	})
+
+	Context("when ServiceMonitor config is nil", func() {
+		BeforeEach(func(ctx context.Context) {
+			Expect(fakeKubeClient.Create(ctx, getServiceMonitorCRD())).To(Succeed())
+		})
+
+		It("should create ServiceMonitors when ServiceMonitor config is nil (default enabled)", func(ctx context.Context) {
+			kaiConfig.Spec.Prometheus.ServiceMonitor = nil
+
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).NotTo(BeEmpty(), "Expected ServiceMonitors to be created by default when config is nil")
+		})
+	})
+
+	Context("when ServiceMonitor CRD is not available", func() {
+		It("should return empty objects list even if ServiceMonitors are enabled", func(ctx context.Context) {
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Enabled = ptr.To(true)
+
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).To(BeEmpty(), "Expected no ServiceMonitors when CRD is not available")
+		})
+	})
+
+	Context("when ServiceMonitor has custom configuration", func() {
+		BeforeEach(func(ctx context.Context) {
+			Expect(fakeKubeClient.Create(ctx, getServiceMonitorCRD())).To(Succeed())
+		})
+
+		It("should apply custom interval and scrape timeout", func(ctx context.Context) {
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Enabled = ptr.To(true)
+			kaiConfig.Spec.Prometheus.ServiceMonitor.Interval = ptr.To("60s")
+			kaiConfig.Spec.Prometheus.ServiceMonitor.ScrapeTimeout = ptr.To("30s")
+
+			objects, err := serviceMonitorsForKAIConfig(ctx, fakeKubeClient, kaiConfig)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(objects).NotTo(BeEmpty())
+
+			// Verify configuration is applied
+			for _, obj := range objects {
+				if sm, ok := obj.(*monitoringv1.ServiceMonitor); ok {
+					Expect(sm.Spec.Endpoints).To(HaveLen(1))
+					Expect(string(sm.Spec.Endpoints[0].Interval)).To(Equal("60s"))
+					Expect(string(sm.Spec.Endpoints[0].ScrapeTimeout)).To(Equal("30s"))
+				}
+			}
+		})
+	})
+})
+
 func getServiceMonitorCRD() *metav1.PartialObjectMetadata {
 	serviceMonitorCRD := &metav1.PartialObjectMetadata{
 		TypeMeta: metav1.TypeMeta{
diff --git a/pkg/operator/operands/prometheus/resources.go b/pkg/operator/operands/prometheus/resources.go
index 1ff6ddf17..725630001 100644
--- a/pkg/operator/operands/prometheus/resources.go
+++ b/pkg/operator/operands/prometheus/resources.go
@@ -200,6 +200,12 @@ func serviceMonitorsForKAIConfig(
 	logger := log.FromContext(ctx)
 	config := kaiConfig.Spec.Prometheus
 
+	// ServiceMonitors are opt-out: only an explicit Enabled=false skips creation (nil config or nil Enabled means enabled)
+	if config.ServiceMonitor != nil && config.ServiceMonitor.Enabled != nil && !*config.ServiceMonitor.Enabled {
+		logger.Info("ServiceMonitors are disabled, skipping ServiceMonitor creation")
+		return []client.Object{}, nil
+	}
+
 	// Check if ServiceMonitor CRD is available
 	hasServiceMonitorCRD, err := common.CheckPrometheusCRDsAvailable(ctx, runtimeClient, "serviceMonitor")
 	if err != nil {