Skip to content

Commit 242cee2

Browse files
committed
Manually create udev links if udev trigger doesn't work
Change-Id: I26bffb3eda447c8343ce39e69c34cf31616120e3
1 parent bec8db5 commit 242cee2

File tree

5 files changed

+167
-17
lines changed

5 files changed

+167
-17
lines changed

pkg/deviceutils/device-utils.go

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,10 @@ func (m *deviceUtils) VerifyDevicePath(devicePaths []string, deviceName string)
290290
})
291291

292292
if err != nil {
293-
return "", fmt.Errorf("failed to find and re-link disk %s with udevadm after retrying for %v: %w", deviceName, pollTimeout, err)
293+
klog.Warningf("For device %s udevadmin failed: %v. Trying to manually link", deviceName, err)
294+
if err := manuallySetDevicePath(deviceName); err != nil {
295+
return "", fmt.Errorf("failed to manually set link for disk %s: %w", deviceName, err)
296+
}
294297
}
295298

296299
return devicePath, nil
@@ -338,11 +341,11 @@ func findAvailableDevFsPaths() ([]string, error) {
338341
return append(diskSDPaths, diskNvmePaths...), nil
339342
}
340343

341-
func udevadmTriggerForDiskIfExists(deviceName string) error {
344+
func findDevice(deviceName string) (string, string, error) {
342345
devFsPathToSerial := map[string]string{}
343346
devFsPaths, err := findAvailableDevFsPaths()
344347
if err != nil {
345-
return err
348+
return "", "", err
346349
}
347350
for _, devFsPath := range devFsPaths {
348351
devFsSerial, err := getDevFsSerial(devFsPath)
@@ -355,17 +358,33 @@ func udevadmTriggerForDiskIfExists(deviceName string) error {
355358
klog.V(4).Infof("device path %s, serial number %v", devFsPath, devFsSerial)
356359
devFsPathToSerial[devFsPath] = devFsSerial
357360
if devFsSerial == deviceName {
358-
// Found the disk that we're looking for so run a trigger on it
359-
// to resolve its /dev/by-id/ path
360-
klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
361-
err := udevadmChangeToDrive(devFsPath)
362-
if err != nil {
363-
return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, err)
364-
}
365-
return nil
361+
return devFsPath, devFsSerial, nil
366362
}
367363
}
368-
return fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
364+
return "", "", fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
365+
}
366+
367+
func manuallySetDevicePath(deviceName string) error {
368+
devFsPath, devFsSerial, err := findDevice(deviceName)
369+
if err != nil {
370+
return err
371+
}
372+
return os.Symlink(devFsPath, path.Join(diskByIdPath, diskGooglePrefix+devFsSerial))
373+
}
374+
375+
func udevadmTriggerForDiskIfExists(deviceName string) error {
376+
devFsPath, devFsSerial, err := findDevice(deviceName)
377+
if err != nil {
378+
return err
379+
}
380+
// Found the disk that we're looking for so run a trigger on it
381+
// to resolve its /dev/by-id/ path
382+
klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
383+
err = udevadmChangeToDrive(devFsPath)
384+
if err != nil {
385+
return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, err)
386+
}
387+
return nil
369388
}
370389

371390
// Calls "udevadm trigger --action=change" on the specified drive. drivePath

test/e2e/tests/setup_e2e_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ var (
4747
serviceAccount = flag.String("service-account", "", "Service account to bring up instance with")
4848
vmNamePrefix = flag.String("vm-name-prefix", "gce-pd-csi-e2e", "VM name prefix")
4949
architecture = flag.String("arch", "amd64", "Architecture pd csi driver build on")
50-
minCpuPlatform = flag.String("min-cpu-platform", "rome", "Minimum CPU architecture")
51-
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "sapphirerapids", "Minimum CPU architecture for multiwriter tests")
50+
minCpuPlatform = flag.String("min-cpu-platform", "AMD Rome", "Minimum CPU architecture")
51+
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "Intel Sapphire Rapids", "Minimum CPU architecture for multiwriter tests")
5252
zones = flag.String("zones", "us-east4-a,us-east4-c", "Zones to run tests in. If there are multiple zones, separate each by comma")
5353
machineType = flag.String("machine-type", "n2d-standard-4", "Type of machine to provision instance on")
5454
imageURL = flag.String("image-url", "projects/ubuntu-os-cloud/global/images/family/ubuntu-minimal-2404-lts-amd64", "OS image url to get image from")

test/e2e/tests/single_zone_e2e_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1668,6 +1668,94 @@ var _ = Describe("GCE PD CSI Driver", func() {
16681668
Expect(err).To(BeNil(), "Failed to rm file path %s: %v", fp, err)
16691669
})
16701670

1671+
It("Should mount if udev disabled, and remount if it's enabled again", func() {
1672+
testContext := getRandomTestContext()
1673+
p, z, _ := testContext.Instance.GetIdentity()
1674+
client := testContext.Client
1675+
instance := testContext.Instance
1676+
1677+
klog.Infof("Disabling udev")
1678+
err := instance.DisableUdev()
1679+
Expect(err).To(BeNil(), "Failed to disable udev")
1680+
1681+
// Create Disk
1682+
volName, volID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)
1683+
vol2Name, vol2ID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)
1684+
1685+
defer func() {
1686+
// Delete Disks
1687+
err := client.DeleteVolume(volID)
1688+
Expect(err).To(BeNil(), "DeleteVolume failed")
1689+
1690+
err = client.DeleteVolume(vol2ID)
1691+
Expect(err).To(BeNil(), "DeleteVolume failed")
1692+
1693+
// Validate Disks Deleted
1694+
_, err = computeService.Disks.Get(p, z, volName).Do()
1695+
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
1696+
_, err = computeService.Disks.Get(p, z, vol2Name).Do()
1697+
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
1698+
}()
1699+
1700+
// Attach & detach disk. We retry as we expect the udev repair to take a little bit of time.
1701+
klog.Infof("Starting attach & detach with disabled udev")
1702+
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
1703+
err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1704+
if err != nil {
1705+
klog.Infof("Initial udev error, retrying: %v", err)
1706+
}
1707+
return err == nil, nil
1708+
})
1709+
Expect(err).To(BeNil(), "Failed to go through volume lifecycle")
1710+
1711+
// Attach a different disk. The conflicting udev paths should not cause a problem.
1712+
klog.Infof("Starting second attach & detach with disabled udev")
1713+
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
1714+
err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1715+
if err != nil {
1716+
klog.Infof("second disk udev error, retrying: %v", err)
1717+
}
1718+
return err == nil, nil
1719+
})
1720+
Expect(err).To(BeNil(), "Failed to go through second volume lifecycle")
1721+
1722+
// Attach, reenable udev, go through lifecycle of second disk, detach first
1723+
klog.Infof("Starting attach & udev re-enable")
1724+
var detacher func()
1725+
var args *verifyArgs
1726+
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
1727+
err, detacher, args = testAttachAndMount(volID, volName, instance, client, attachAndMountArgs{})
1728+
if err != nil {
1729+
klog.Infof("attach before reenable failed, retrying: %v", err)
1730+
}
1731+
return err == nil, nil
1732+
})
1733+
Expect(err).To(BeNil(), "Failed second attach")
1734+
defer detacher()
1735+
1736+
klog.Infof("Re-enabling udev")
1737+
err = instance.EnableUdev()
1738+
Expect(err).To(BeNil(), "Failed to enable udev")
1739+
1740+
// After udev is enabled we expect everything to succeed on the first try.
1741+
1742+
klog.Infof("Testing attach & detach with re-enabled udev")
1743+
err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1744+
Expect(err).To(BeNil(), "Failed to go through nested volume lifecycle with enabled")
1745+
1746+
klog.Infof("Testing detach with re-enabled udev")
1747+
err = client.NodeUnpublishVolume(volID, args.publishDir)
1748+
Expect(err).To(BeNil(), "Failed to unpublish first")
1749+
1750+
err = client.NodeUnstageVolume(volID, args.stageDir)
1751+
Expect(err).To(BeNil(), "Failed to unstage first")
1752+
1753+
// Go through complete lifecycle again, with udev enabled.
1754+
klog.Infof("Testing final lifecycle enabled udev")
1755+
err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1756+
Expect(err).To(BeNil(), "Failed to go through volume lifecycle with udev enabled")
1757+
})
1758+
16711759
type multiZoneTestConfig struct {
16721760
diskType string
16731761
readOnly bool

test/remote/instance.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
9292
// Ideally we could compare to see if the new instance has a greater minCpuPlatform.
9393
// For now we just check it was set and it's different.
9494
if curInst.MinCpuPlatform != "" && curInst.MinCpuPlatform != newInst.MinCpuPlatform {
95-
klog.Infof("CPU Platform mismatch")
95+
klog.Infof("CPU Platform mismatch: cur: %v; new: %v", curInst.MinCpuPlatform, newInst.MinCpuPlatform)
9696
return true
9797
}
9898
if (curInst.ConfidentialInstanceConfig != nil && newInst.ConfidentialInstanceConfig == nil) ||
@@ -102,7 +102,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
102102
return true
103103
}
104104
if curInst.SourceMachineImage != newInst.SourceMachineImage {
105-
klog.Infof("Source Machine Mismatch")
105+
klog.Infof("Source Machine Mismatch: cur: %v; new: %v", curInst.SourceMachineImage, newInst.SourceMachineImage)
106106
return true
107107
}
108108
return false
@@ -131,7 +131,8 @@ func (i *InstanceInfo) CreateOrGetInstance(localSSDCount int) error {
131131
Type: "ONE_TO_ONE_NAT",
132132
Name: "External NAT",
133133
},
134-
}},
134+
},
135+
},
135136
},
136137
Disks: []*compute.AttachedDisk{
137138
{

test/remote/ssh.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,48 @@ func (i *InstanceInfo) SSHCheckAlive() error {
100100
})
101101
}
102102

103+
func (i *InstanceInfo) DisableUdev() error {
104+
return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
105+
_, err := i.SSH("systemctl", "stop", "systemd-udevd")
106+
if err != nil {
107+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd: %v", err)
108+
return false, nil
109+
}
110+
_, err = i.SSH("systemctl", "stop", "systemd-udevd-kernel.socket")
111+
if err != nil {
112+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd-kernel.socket: %v", err)
113+
return false, nil
114+
}
115+
_, err = i.SSH("systemctl", "stop", "systemd-udevd-control.socket")
116+
if err != nil {
117+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd-control.socket: %v", err)
118+
return false, nil
119+
}
120+
return true, nil
121+
})
122+
}
123+
124+
func (i *InstanceInfo) EnableUdev() error {
125+
return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
126+
_, err := i.SSH("systemctl", "start", "systemd-udevd")
127+
if err != nil {
128+
klog.V(2).Infof("(will retry) failed to start systemd-udevd: %v", err)
129+
return false, nil
130+
}
131+
_, err = i.SSH("systemctl", "start", "systemd-udevd-kernel.socket")
132+
if err != nil {
133+
klog.V(2).Infof("(will retry) failed to start systemd-udevd-kernel.socket: %v", err)
134+
return false, nil
135+
}
136+
_, err = i.SSH("systemctl", "start", "systemd-udevd-control.socket")
137+
if err != nil {
138+
klog.V(2).Infof("(will retry) failed to start systemd-udevd-control.socket: %v", err)
139+
return false, nil
140+
}
141+
return true, nil
142+
})
143+
}
144+
103145
// runSSHCommand executes the ssh or scp command, adding the flag provided --ssh-options
104146
func runSSHCommand(cmd string, args ...string) (string, error) {
105147
if pk, ok := os.LookupEnv("JENKINS_GCE_SSH_PRIVATE_KEY_FILE"); ok {

0 commit comments

Comments
 (0)