Skip to content

Commit

Permalink
Merge pull request #1947 from marquiz/devel/health-topology-updater
Browse files Browse the repository at this point in the history
nfd-topology-updater: serve metrics and healthz on the same port
  • Loading branch information
k8s-ci-robot authored Feb 17, 2025
2 parents 2653488 + 0b9a8cf commit 797c66f
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 78 deletions.
6 changes: 2 additions & 4 deletions cmd/nfd-topology-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
"Do not create or update NodeResourceTopology objects.")
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
"Kube config file.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
"Port on which to expose the grpc health endpoint.")
flagset.IntVar(&args.Port, "port", 8080,
"Port which metrics and healthz endpoints are served on")
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ spec:
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand All @@ -41,5 +43,5 @@ spec:
cpu: 50m
memory: 40Mi
ports:
- name: metrics
containerPort: 8081
- name: http
containerPort: 8080
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ spec:
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
livenessProbe:
grpc:
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand All @@ -60,8 +61,9 @@ spec:
timeoutSeconds: {{ . }}
{{- end }}
readinessProbe:
grpc:
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand Down Expand Up @@ -113,16 +115,13 @@ spec:
# Disable kubelet state tracking by giving an empty path
- "-kubelet-state-dir="
{{- end }}
- "-metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}"
- "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}"
- "-port={{ .Values.topologyUpdater.port | default "8080"}}"
{{- with .Values.topologyUpdater.extraArgs }}
{{- toYaml . | nindent 10 }}
{{- end }}
ports:
- containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
name: metrics
- containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }}
name: health
- containerPort: {{ .Values.topologyUpdater.port | default "8080"}}
name: http
volumeMounts:
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config
Expand Down
9 changes: 2 additions & 7 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -502,8 +502,7 @@ topologyUpdater:
rbac:
create: true

metricsPort: 8081
healthPort: 8082
port: 8080
kubeletConfigPath:
kubeletPodResourcesSockPath:
updateInterval: 60s
Expand All @@ -517,17 +516,13 @@ topologyUpdater:
drop: [ "ALL" ]
readOnlyRootFilesystem: true
runAsUser: 0

livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
# failureThreshold: 3
# periodSeconds: 10
# timeoutSeconds: 1
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
failureThreshold: 10
# periodSeconds: 10
Expand Down
3 changes: 1 addition & 2 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,7 @@ API's you need to install the prometheus operator in your cluster.
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
| `topologyUpdater.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
| `topologyUpdater.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
Expand Down
70 changes: 21 additions & 49 deletions pkg/nfd-topology-updater/nfd-topology-updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,13 @@ package nfdtopologyupdater

import (
"fmt"
"net"
"net/http"
"net/url"
"os"
"path/filepath"

"golang.org/x/net/context"

"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand All @@ -37,6 +34,7 @@ import (

"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater/kubeletnotifier"
"sigs.k8s.io/node-feature-discovery/pkg/podres"
"sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor"
Expand All @@ -56,13 +54,12 @@ const (

// Args are the command line arguments
type Args struct {
MetricsPort int
Port int
NoPublish bool
Oneshot bool
KubeConfigFile string
ConfigFile string
KubeletStateDir string
GrpcHealthPort int

Klog map[string]*utils.KlogFlagVal
}
Expand Down Expand Up @@ -90,7 +87,6 @@ type nfdTopologyUpdater struct {
ownerRefs []metav1.OwnerReference
k8sClient k8sclient.Interface
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
healthServer *grpc.Server
}

// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
Expand Down Expand Up @@ -134,27 +130,8 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
}

func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)

go func() {
defer func() {
lis.Close()
}()
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")
}()
w.healthServer = s
return nil
func (w *nfdTopologyUpdater) Healthz(writer http.ResponseWriter, _ *http.Request) {
writer.WriteHeader(http.StatusOK)
}

// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
Expand Down Expand Up @@ -187,15 +164,14 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
}

httpMux := http.NewServeMux()

// Register to metrics server
if w.args.MetricsPort > 0 {
m := utils.CreateMetricsServer(w.args.MetricsPort,
buildInfo,
scanErrors)
go m.Run()
registerVersion(version.Get())
defer m.Stop()
}
promRegistry := prometheus.NewRegistry()
promRegistry.MustRegister(
buildInfo,
scanErrors)
registerVersion(version.Get())

var resScan resourcemonitor.ResourcesScanner

Expand All @@ -215,20 +191,19 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("failed to obtain node resource information: %w", err)
}

grpcErr := make(chan error)
// Register health probe (at this point we're "ready and live")
httpMux.HandleFunc("/healthz", w.Healthz)

// Start gRPC server for liveness probe (at this point we're "live")
if w.args.GrpcHealthPort != 0 {
if err := w.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}
// Start HTTP server
httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux}
go func() {
klog.InfoS("http server starting", "port", httpServer.Addr)
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
}()
defer httpServer.Close()

for {
select {
case err := <-grpcErr:
return fmt.Errorf("error in serving gRPC: %w", err)

case info := <-w.eventSource:
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
scanResponse, err := resScan.Scan()
Expand Down Expand Up @@ -257,9 +232,6 @@ func (w *nfdTopologyUpdater) Run() error {

case <-w.stop:
klog.InfoS("shutting down nfd-topology-updater")
if w.healthServer != nil {
w.healthServer.GracefulStop()
}
return nil
}
}
Expand Down

0 comments on commit 797c66f

Please sign in to comment.