Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 31 additions & 12 deletions pkg/deviceutils/device-utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,10 @@ func (m *deviceUtils) VerifyDevicePath(devicePaths []string, deviceName string)
})

if err != nil {
return "", fmt.Errorf("failed to find and re-link disk %s with udevadm after retrying for %v: %w", deviceName, pollTimeout, err)
klog.Warningf("For device %s udevadmin failed: %v. Trying to manually link", deviceName, err)
if err := manuallySetDevicePath(deviceName); err != nil {
return "", fmt.Errorf("failed to manually set link for disk %s: %w", deviceName, err)
}
}

return devicePath, nil
Expand Down Expand Up @@ -338,11 +341,11 @@ func findAvailableDevFsPaths() ([]string, error) {
return append(diskSDPaths, diskNvmePaths...), nil
}

func udevadmTriggerForDiskIfExists(deviceName string) error {
func findDevice(deviceName string) (string, string, error) {
devFsPathToSerial := map[string]string{}
devFsPaths, err := findAvailableDevFsPaths()
if err != nil {
return err
return "", "", err
}
for _, devFsPath := range devFsPaths {
devFsSerial, err := getDevFsSerial(devFsPath)
Expand All @@ -355,17 +358,33 @@ func udevadmTriggerForDiskIfExists(deviceName string) error {
klog.V(4).Infof("device path %s, serial number %v", devFsPath, devFsSerial)
devFsPathToSerial[devFsPath] = devFsSerial
if devFsSerial == deviceName {
// Found the disk that we're looking for so run a trigger on it
// to resolve its /dev/by-id/ path
klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
err := udevadmChangeToDrive(devFsPath)
if err != nil {
return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, err)
}
return nil
return devFsPath, devFsSerial, nil
}
}
return fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
return "", "", fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
}

// manuallySetDevicePath creates the /dev/disk/by-id symlink for deviceName
// directly, as a fallback for when udev fails to resolve it. It locates the
// device file whose serial number matches deviceName and links it under
// diskByIdPath with the Google persistent-disk prefix.
//
// If a link already exists at the target path (e.g. left over from a previous
// attachment of a disk with the same name), it is verified and, when stale,
// replaced — a bare os.Symlink would otherwise fail with EEXIST.
func manuallySetDevicePath(deviceName string) error {
	devFsPath, devFsSerial, err := findDevice(deviceName)
	if err != nil {
		return err
	}
	linkPath := path.Join(diskByIdPath, diskGooglePrefix+devFsSerial)
	symlinkErr := os.Symlink(devFsPath, linkPath)
	if symlinkErr == nil {
		return nil
	}
	if !os.IsExist(symlinkErr) {
		return symlinkErr
	}
	// A link is already present. If it already points at the right device
	// file there is nothing to do; otherwise replace the stale link.
	if target, readErr := os.Readlink(linkPath); readErr == nil && target == devFsPath {
		return nil
	}
	if err := os.Remove(linkPath); err != nil {
		return fmt.Errorf("failed to remove stale link %s: %w", linkPath, err)
	}
	return os.Symlink(devFsPath, linkPath)
}

// udevadmTriggerForDiskIfExists looks up the device file whose serial number
// matches deviceName and, if found, runs "udevadm trigger" against it so that
// udev (re)creates its /dev/disk/by-id symlink. An error is returned when the
// device cannot be found or the trigger fails.
func udevadmTriggerForDiskIfExists(deviceName string) error {
	devFsPath, devFsSerial, err := findDevice(deviceName)
	if err != nil {
		return err
	}
	// Found the disk that we're looking for so run a trigger on it
	// to resolve its /dev/by-id/ path
	klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
	if triggerErr := udevadmChangeToDrive(devFsPath); triggerErr != nil {
		return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, triggerErr)
	}
	return nil
}

// Calls "udevadm trigger --action=change" on the specified drive. drivePath
Expand Down
4 changes: 2 additions & 2 deletions test/e2e/tests/setup_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ var (
serviceAccount = flag.String("service-account", "", "Service account to bring up instance with")
vmNamePrefix = flag.String("vm-name-prefix", "gce-pd-csi-e2e", "VM name prefix")
architecture = flag.String("arch", "amd64", "Architecture pd csi driver build on")
minCpuPlatform = flag.String("min-cpu-platform", "rome", "Minimum CPU architecture")
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "sapphirerapids", "Minimum CPU architecture for multiwriter tests")
minCpuPlatform = flag.String("min-cpu-platform", "AMD Rome", "Minimum CPU architecture")
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "Intel Sapphire Rapids", "Minimum CPU architecture for multiwriter tests")
zones = flag.String("zones", "us-east4-a,us-east4-c", "Zones to run tests in. If there are multiple zones, separate each by comma")
machineType = flag.String("machine-type", "n2d-standard-4", "Type of machine to provision instance on")
imageURL = flag.String("image-url", "projects/ubuntu-os-cloud/global/images/family/ubuntu-minimal-2404-lts-amd64", "OS image url to get image from")
Expand Down
88 changes: 88 additions & 0 deletions test/e2e/tests/single_zone_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1668,6 +1668,94 @@ var _ = Describe("GCE PD CSI Driver", func() {
Expect(err).To(BeNil(), "Failed to rm file path %s: %v", fp, err)
})

// E2E scenario: with the node's udev daemon stopped, attach/mount must still
// succeed (the driver falls back to manually creating the by-id symlink — see
// the retry loops below, which allow the repair some time); after udev is
// re-enabled, every operation is expected to succeed on the first try.
It("Should mount if udev disabled, and remount if it's enabled again", func() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we verify a few additional scenarios?

  1. Detaching a disk with a manually created symlink cleans up the links
  2. After re-enabling udev, could it override the manually created link?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think those should be covered already in the scenario below?

testContext := getRandomTestContext()
p, z, _ := testContext.Instance.GetIdentity()
client := testContext.Client
instance := testContext.Instance

// Stop udev on the instance before creating any disks, so the first
// attach exercises the no-udev code path.
klog.Infof("Disabling udev")
err := instance.DisableUdev()
Expect(err).To(BeNil(), "Failed to disable udev")

// Create Disk
volName, volID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)
vol2Name, vol2ID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)

defer func() {
// Delete Disks
err := client.DeleteVolume(volID)
Expect(err).To(BeNil(), "DeleteVolume failed")

err = client.DeleteVolume(vol2ID)
Expect(err).To(BeNil(), "DeleteVolume failed")

// Validate Disks Deleted
_, err = computeService.Disks.Get(p, z, volName).Do()
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
_, err = computeService.Disks.Get(p, z, vol2Name).Do()
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
}()

// Attach & detach disk. We retry as we expect the udev repair to take a little bit of time.
klog.Infof("Starting attach & detach with disabled udev")
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
if err != nil {
klog.Infof("Initial udev error, retrying: %v", err)
}
return err == nil, nil
})
Expect(err).To(BeNil(), "Failed to go through volume lifecycle")

// Attach a different disk. The conflicting udev paths should not cause a problem.
klog.Infof("Starting second attach & detach with disabled udev")
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
if err != nil {
klog.Infof("second disk udev error, retrying: %v", err)
}
return err == nil, nil
})
Expect(err).To(BeNil(), "Failed to go through second volume lifecycle")

// Attach, reenable udev, go through lifecycle of second disk, detach first
// (the first disk is kept mounted across the udev restart so the restart
// happens while its manually-created link is in use).
klog.Infof("Starting attach & udev re-enable")
var detacher func()
var args *verifyArgs
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
err, detacher, args = testAttachAndMount(volID, volName, instance, client, attachAndMountArgs{})
if err != nil {
klog.Infof("attach before reenable failed, retrying: %v", err)
}
return err == nil, nil
})
Expect(err).To(BeNil(), "Failed second attach")
defer detacher()

klog.Infof("Re-enabling udev")
err = instance.EnableUdev()
Expect(err).To(BeNil(), "Failed to enable udev")

// After udev is enabled we expect everything to succeed on the first try.

klog.Infof("Testing attach & detach with re-enabled udev")
err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
Expect(err).To(BeNil(), "Failed to go through nested volume lifecycle with enabled")

// Unmount/unstage the disk that was attached before the udev restart.
klog.Infof("Testing detach with re-enabled udev")
err = client.NodeUnpublishVolume(volID, args.publishDir)
Expect(err).To(BeNil(), "Failed to unpublish first")

err = client.NodeUnstageVolume(volID, args.stageDir)
Expect(err).To(BeNil(), "Failed to unstage first")

// Go through complete lifecycle again, with udev enabled.
klog.Infof("Testing final lifecycle enabled udev")
err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
Expect(err).To(BeNil(), "Failed to go through volume lifecycle with udev enabled")
})

type multiZoneTestConfig struct {
diskType string
readOnly bool
Expand Down
7 changes: 4 additions & 3 deletions test/remote/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
// Ideally we could compare to see if the new instance has a greater minCpuPlatfor
// For now we just check it was set and it's different.
if curInst.MinCpuPlatform != "" && curInst.MinCpuPlatform != newInst.MinCpuPlatform {
klog.Infof("CPU Platform mismatch")
klog.Infof("CPU Platform mismatch: cur: %v; new: %v", curInst.MinCpuPlatform, newInst.MinCpuPlatform)
return true
}
if (curInst.ConfidentialInstanceConfig != nil && newInst.ConfidentialInstanceConfig == nil) ||
Expand All @@ -102,7 +102,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
return true
}
if curInst.SourceMachineImage != newInst.SourceMachineImage {
klog.Infof("Source Machine Mismatch")
klog.Infof("Source Machine Mismatch: cur: %v; new: %v", curInst.SourceMachineImage, newInst.SourceMachineImage)
return true
}
return false
Expand Down Expand Up @@ -131,7 +131,8 @@ func (i *InstanceInfo) CreateOrGetInstance(localSSDCount int) error {
Type: "ONE_TO_ONE_NAT",
Name: "External NAT",
},
}},
},
},
},
Disks: []*compute.AttachedDisk{
{
Expand Down
42 changes: 42 additions & 0 deletions test/remote/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,48 @@ func (i *InstanceInfo) SSHCheckAlive() error {
})
}

// DisableUdev stops the udev daemon and its activation sockets on the remote
// instance over SSH, retrying every 5s for up to a minute until all units are
// stopped.
//
// The sockets are stopped before the service: systemd-udevd is
// socket-activated, so if the sockets were left running a later device event
// would simply start the daemon again.
func (i *InstanceInfo) DisableUdev() error {
	units := []string{
		"systemd-udevd-kernel.socket",
		"systemd-udevd-control.socket",
		"systemd-udevd",
	}
	return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
		for _, unit := range units {
			if _, err := i.SSH("systemctl", "stop", unit); err != nil {
				klog.V(2).Infof("(will retry) failed to stop %s: %v", unit, err)
				return false, nil
			}
		}
		return true, nil
	})
}

// EnableUdev starts the udev daemon and its activation sockets on the remote
// instance over SSH (the inverse of DisableUdev), retrying every 5s for up to
// a minute until all units are running.
func (i *InstanceInfo) EnableUdev() error {
	// Same unit order as the original implementation: the service first,
	// then its sockets.
	units := []string{
		"systemd-udevd",
		"systemd-udevd-kernel.socket",
		"systemd-udevd-control.socket",
	}
	return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
		for _, unit := range units {
			if _, err := i.SSH("systemctl", "start", unit); err != nil {
				klog.V(2).Infof("(will retry) failed to start %s: %v", unit, err)
				return false, nil
			}
		}
		return true, nil
	})
}

// runSSHCommand executes the ssh or scp command, adding the flag provided --ssh-options
func runSSHCommand(cmd string, args ...string) (string, error) {
if pk, ok := os.LookupEnv("JENKINS_GCE_SSH_PRIVATE_KEY_FILE"); ok {
Expand Down