diff --git a/cmd/gce-pd-csi-driver/main.go b/cmd/gce-pd-csi-driver/main.go index ae554360f..4afda4da7 100644 --- a/cmd/gce-pd-csi-driver/main.go +++ b/cmd/gce-pd-csi-driver/main.go @@ -149,13 +149,22 @@ func handle() { } var metricsManager *metrics.MetricsManager = nil - if *runControllerService && *httpEndpoint != "" { + runServiceWithMetrics := *runControllerService || *runNodeService + if runServiceWithMetrics && *httpEndpoint != "" { mm := metrics.NewMetricsManager() mm.InitializeHttpHandler(*httpEndpoint, *metricsPath) - mm.RegisterPDCSIMetric() - if metrics.IsGKEComponentVersionAvailable() { - mm.EmitGKEComponentVersion() + switch { + case *runControllerService: + mm.RegisterPDCSIMetric() + if metrics.IsGKEComponentVersionAvailable() { + mm.EmitGKEComponentVersion() + } + case *runNodeService: + if err := mm.EmmitProcessStartTime(); err != nil { + klog.Errorf("Failed to emit process start time: %v", err.Error()) + } + mm.RegisterMountMetric() } metricsManager = &mm } @@ -250,6 +259,7 @@ func handle() { DeviceInUseTimeout: *deviceInUseTimeout, EnableDataCache: *enableDataCacheFlag, DataCacheEnabledNodePool: isDataCacheEnabledNodePool(ctx, *nodeName), + MetricsManager: metricsManager, } nodeServer = driver.NewNodeServer(gceDriver, mounter, deviceUtils, meta, statter, nsArgs) if *maxConcurrentFormatAndMount > 0 { diff --git a/pkg/gce-pd-csi-driver/gce-pd-driver.go b/pkg/gce-pd-csi-driver/gce-pd-driver.go index b92702089..b39537f1c 100644 --- a/pkg/gce-pd-csi-driver/gce-pd-driver.go +++ b/pkg/gce-pd-csi-driver/gce-pd-driver.go @@ -157,6 +157,7 @@ func NewNodeServer(gceDriver *GCEDriver, mounter *mount.SafeFormatAndMount, devi deviceInUseErrors: newDeviceErrMap(args.DeviceInUseTimeout), EnableDataCache: args.EnableDataCache, DataCacheEnabledNodePool: args.DataCacheEnabledNodePool, + metricsManager: args.MetricsManager, } } diff --git a/pkg/gce-pd-csi-driver/node.go b/pkg/gce-pd-csi-driver/node.go index cbd33cb89..076d80c68 100644 --- 
a/pkg/gce-pd-csi-driver/node.go +++ b/pkg/gce-pd-csi-driver/node.go @@ -36,6 +36,7 @@ import ( "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils" metadataservice "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/metadata" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics" mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/resizefs" ) @@ -72,6 +73,8 @@ type GCENodeServer struct { // Embed UnimplementedNodeServer to ensure the driver returns Unimplemented for any // new RPC methods that might be introduced in future versions of the spec. csi.UnimplementedNodeServer + + metricsManager *metrics.MetricsManager } type NodeServerArgs struct { @@ -84,6 +87,8 @@ type NodeServerArgs struct { EnableDataCache bool DataCacheEnabledNodePool bool + + MetricsManager *metrics.MetricsManager } var _ csi.NodeServer = &GCENodeServer{} @@ -416,6 +421,10 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage return &csi.NodeStageVolumeResponse{}, nil } } + + if ns.metricsManager != nil { + ns.metricsManager.RecordMountErrorMetric(err) + } return nil, status.Error(codes.Internal, fmt.Sprintf("Failed to format and mount device from (%q) to (%q) with fstype (%q) and options (%q): %v", devicePath, stagingTargetPath, fstype, options, err.Error())) diff --git a/pkg/gce-pd-csi-driver/node_test.go b/pkg/gce-pd-csi-driver/node_test.go index ee016df0c..4081d4fdd 100644 --- a/pkg/gce-pd-csi-driver/node_test.go +++ b/pkg/gce-pd-csi-driver/node_test.go @@ -52,7 +52,7 @@ func getTestGCEDriverWithCustomMounter(t *testing.T, mounter *mount.SafeFormatAn func getCustomTestGCEDriver(t *testing.T, mounter *mount.SafeFormatAndMount, deviceUtils deviceutils.DeviceUtils, metaService metadataservice.MetadataService) *GCEDriver { gceDriver := GetGCEDriver() enableDataCache 
:= false - nodeServer := NewNodeServer(gceDriver, mounter, deviceUtils, metaService, mountmanager.NewFakeStatter(mounter), NodeServerArgs{true, 0, enableDataCache, false /*dataCacheEnableNodePool */}) + nodeServer := NewNodeServer(gceDriver, mounter, deviceUtils, metaService, mountmanager.NewFakeStatter(mounter), NodeServerArgs{true, 0, enableDataCache, false /*dataCacheEnableNodePool */, nil /*metricsManager*/}) err := gceDriver.SetupGCEDriver(driver, "test-vendor", nil, nil, nil, nil, nodeServer) if err != nil { t.Fatalf("Failed to setup GCE Driver: %v", err) @@ -63,7 +63,7 @@ func getCustomTestGCEDriver(t *testing.T, mounter *mount.SafeFormatAndMount, dev func getTestBlockingMountGCEDriver(t *testing.T, readyToExecute chan chan struct{}) *GCEDriver { gceDriver := GetGCEDriver() mounter := mountmanager.NewFakeSafeBlockingMounter(readyToExecute) - nodeServer := NewNodeServer(gceDriver, mounter, deviceutils.NewFakeDeviceUtils(false), metadataservice.NewFakeService(), mountmanager.NewFakeStatter(mounter), NodeServerArgs{true, 0, true, false /*dataCacheEnableNodePool */}) + nodeServer := NewNodeServer(gceDriver, mounter, deviceutils.NewFakeDeviceUtils(false), metadataservice.NewFakeService(), mountmanager.NewFakeStatter(mounter), NodeServerArgs{true, 0, true, false /*dataCacheEnableNodePool */, nil /*metricsManager*/}) err := gceDriver.SetupGCEDriver(driver, "test-vendor", nil, nil, nil, nil, nodeServer) if err != nil { t.Fatalf("Failed to setup GCE Driver: %v", err) @@ -75,7 +75,7 @@ func getTestBlockingFormatAndMountGCEDriver(t *testing.T, readyToExecute chan ch gceDriver := GetGCEDriver() enableDataCache := true mounter := mountmanager.NewFakeSafeBlockingMounter(readyToExecute) - nodeServer := NewNodeServer(gceDriver, mounter, deviceutils.NewFakeDeviceUtils(false), metadataservice.NewFakeService(), mountmanager.NewFakeStatter(mounter), NodeServerArgs{true, 0, enableDataCache, false /*dataCacheEnableNodePool */}).WithSerializedFormatAndMount(5*time.Second, 1) + 
nodeServer := NewNodeServer(gceDriver, mounter, deviceutils.NewFakeDeviceUtils(false), metadataservice.NewFakeService(), mountmanager.NewFakeStatter(mounter), NodeServerArgs{true, 0, enableDataCache, false /*dataCacheEnableNodePool */, nil /*metricsManager*/}).WithSerializedFormatAndMount(5*time.Second, 1) err := gceDriver.SetupGCEDriver(driver, "test-vendor", nil, nil, nil, nil, nodeServer) if err != nil { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 600a1b2f0..7e6d4d2cd 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -25,6 +25,7 @@ import ( "google.golang.org/grpc/codes" "k8s.io/component-base/metrics" "k8s.io/klog/v2" + "k8s.io/mount-utils" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" ) @@ -53,6 +54,15 @@ var ( StabilityLevel: metrics.ALPHA, }, []string{"driver_name", "method_name", "grpc_status_code", "disk_type", "enable_confidential_storage", "enable_storage_pools"}) + + mountErrorMetric = metrics.NewCounterVec(&metrics.CounterOpts{ + Subsystem: "node", + Name: "mount_errors", + Help: "Node server file system mounting errors", + StabilityLevel: metrics.ALPHA, + }, + []string{"error_type"}, + ) ) type MetricsManager struct { @@ -78,6 +88,10 @@ func (mm *MetricsManager) RegisterPDCSIMetric() { mm.registry.MustRegister(pdcsiOperationErrorsMetric) } +func (mm *MetricsManager) RegisterMountMetric() { + mm.registry.MustRegister(mountErrorMetric) +} + func (mm *MetricsManager) recordComponentVersionMetric() error { v := getEnvVar(envGKEPDCSIVersion) if v == "" { @@ -101,6 +115,20 @@ func (mm *MetricsManager) RecordOperationErrorMetrics( klog.Infof("Recorded PDCSI operation error code: %q", errCode) } +// RecordMountErrorMetric increments the mount error counter, labeled by error type, when err is or wraps a mount.MountError; any other error is ignored and nothing is logged. +func (mm *MetricsManager) RecordMountErrorMetric(err error) { + mntErr := &mount.MountError{} + if !errors.As(err, mntErr) { + return + } + mountErrorMetric.WithLabelValues(string(mntErr.Type)).Inc() + klog.Infof("Recorded mount error type: %q", mntErr.Type) +} + +func (mm *MetricsManager) EmmitProcessStartTime() 
error { + return metrics.RegisterProcessStartTime(mm.registry.Register) +} + func (mm *MetricsManager) EmitGKEComponentVersion() error { mm.registerComponentVersionMetric() if err := mm.recordComponentVersionMetric(); err != nil {