Skip to content

Commit 859f82a

Browse files
authored
Merge pull request #2223 from k8s-infra-cherrypick-robot/cherry-pick-2193-to-release-1.21
[release-1.21] feat: Instrument node metrics for unexpected device path changes
2 parents 2379b35 + 8d3f056 commit 859f82a

File tree

5 files changed

+39
-12
lines changed

5 files changed

+39
-12
lines changed

cmd/gce-pd-csi-driver/main.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -173,6 +173,7 @@ func handle() {
173173
klog.Errorf("Failed to emit process start time: %v", err.Error())
174174
}
175175
mm.RegisterMountMetric()
176+
mm.RegisterUnexpectedDevicePathChangesMetric()
176177
}
177178
metricsManager = &mm
178179
}
@@ -281,7 +282,7 @@ func handle() {
281282
klog.Fatalf("Failed to get node info from API server: %v", err.Error())
282283
}
283284

284-
deviceCache, err := linkcache.NewDeviceCacheForNode(ctx, *diskCacheSyncPeriod, *nodeName, driverName, deviceUtils)
285+
deviceCache, err := linkcache.NewDeviceCacheForNode(ctx, *diskCacheSyncPeriod, *nodeName, driverName, deviceUtils, metricsManager)
285286
if err != nil {
286287
klog.Warningf("Failed to create device cache: %v", err.Error())
287288
} else {

pkg/linkcache/devices_linux.go

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -12,21 +12,22 @@ import (
1212
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
1313
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils"
1414
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/k8sclient"
15+
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics"
1516
)
1617

1718
const byIdDir = "/dev/disk/by-id"
1819

19-
func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils) (*DeviceCache, error) {
20+
func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils, metricsManager *metrics.MetricsManager) (*DeviceCache, error) {
2021
node, err := k8sclient.GetNodeWithRetry(ctx, nodeName)
2122
if err != nil {
2223
return nil, fmt.Errorf("failed to get node %s: %w", nodeName, err)
2324
}
2425

25-
return newDeviceCacheForNode(period, node, driverName, deviceUtils), nil
26+
return newDeviceCacheForNode(period, node, driverName, deviceUtils, metricsManager), nil
2627
}
2728

2829
func NewTestDeviceCache(period time.Duration, node *v1.Node) *DeviceCache {
29-
return newDeviceCacheForNode(period, node, "pd.csi.storage.gke.io", deviceutils.NewDeviceUtils())
30+
return newDeviceCacheForNode(period, node, "pd.csi.storage.gke.io", deviceutils.NewDeviceUtils(), nil)
3031
}
3132

3233
func NewTestNodeWithVolumes(volumes []string) *v1.Node {
@@ -42,12 +43,13 @@ func NewTestNodeWithVolumes(volumes []string) *v1.Node {
4243
}
4344
}
4445

45-
func newDeviceCacheForNode(period time.Duration, node *v1.Node, driverName string, deviceUtils deviceutils.DeviceUtils) *DeviceCache {
46+
func newDeviceCacheForNode(period time.Duration, node *v1.Node, driverName string, deviceUtils deviceutils.DeviceUtils, metricsManager *metrics.MetricsManager) *DeviceCache {
4647
deviceCache := &DeviceCache{
47-
symlinks: make(map[string]deviceMapping),
48-
period: period,
49-
deviceUtils: deviceUtils,
50-
dir: byIdDir,
48+
symlinks: make(map[string]deviceMapping),
49+
period: period,
50+
deviceUtils: deviceUtils,
51+
dir: byIdDir,
52+
metricsManager: metricsManager,
5153
}
5254

5355
// Look at the status.volumesInUse field. For each, take the last section
@@ -163,6 +165,9 @@ func (d *DeviceCache) listAndUpdate() {
163165
// Check if the realPath has changed
164166
if realPath != device.realPath {
165167
klog.Warningf("Change in device path for volume %s (symlink: %s), previous path: %s, new path: %s", device.volumeID, symlink, device.realPath, realPath)
168+
if d.metricsManager != nil {
169+
d.metricsManager.RecordUnexpectedDevicePathChangesMetric()
170+
}
166171

167172
// Update the cache with the new realPath
168173
device.realPath = realPath

pkg/linkcache/devices_windows.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,10 @@ import (
88

99
"k8s.io/klog/v2"
1010
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils"
11+
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics"
1112
)
1213

13-
func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils) (*DeviceCache, error) {
14+
func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils, metricsManager *metrics.MetricsManager) (*DeviceCache, error) {
1415
klog.Infof("NewDeviceCacheForNode is not implemented for Windows")
1516
return nil, nil
1617
}

pkg/linkcache/types.go

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import (
55
"time"
66

77
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils"
8+
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics"
89
)
910

1011
type deviceMapping struct {
@@ -17,6 +18,7 @@ type DeviceCache struct {
1718
symlinks map[string]deviceMapping
1819
period time.Duration
1920
// dir is the directory to look for device symlinks
20-
dir string
21-
deviceUtils deviceutils.DeviceUtils
21+
dir string
22+
deviceUtils deviceutils.DeviceUtils
23+
metricsManager *metrics.MetricsManager
2224
}

pkg/metrics/metrics.go

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,15 @@ var (
6363
},
6464
[]string{"driver_name", "file_system_format", "error_type"},
6565
)
66+
67+
unexpectedDevicePathChangesMetric = metrics.NewCounterVec(&metrics.CounterOpts{
68+
Subsystem: "node",
69+
Name: "unexpected_device_path_changes",
70+
Help: "Unexpected device path changes",
71+
StabilityLevel: metrics.ALPHA,
72+
},
73+
[]string{"driver_name"},
74+
)
6675
)
6776

6877
type MetricsManager struct {
@@ -92,6 +101,10 @@ func (mm *MetricsManager) RegisterMountMetric() {
92101
mm.registry.MustRegister(mountErrorMetric)
93102
}
94103

104+
func (mm *MetricsManager) RegisterUnexpectedDevicePathChangesMetric() {
105+
mm.registry.MustRegister(unexpectedDevicePathChangesMetric)
106+
}
107+
95108
func (mm *MetricsManager) recordComponentVersionMetric() error {
96109
v := getEnvVar(envGKEPDCSIVersion)
97110
if v == "" {
@@ -121,6 +134,11 @@ func (mm *MetricsManager) RecordMountErrorMetric(fs_format string, err error) {
121134
klog.Infof("Recorded mount error type: %q", errType)
122135
}
123136

137+
func (mm *MetricsManager) RecordUnexpectedDevicePathChangesMetric() {
138+
unexpectedDevicePathChangesMetric.WithLabelValues(pdcsiDriverName).Inc()
139+
klog.Infof("Recorded unexpected device path change")
140+
}
141+
124142
func (mm *MetricsManager) EmmitProcessStartTime() error {
125143
return metrics.RegisterProcessStartTime(mm.registry.Register)
126144
}

0 commit comments

Comments
 (0)