Skip to content

Commit 0313d88

Browse files
Refactor e2e test infrastructure with unified helpers leveraging GPU operator clientsets
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent 70b458d commit 0313d88

File tree

10 files changed

+841
-165
lines changed

10 files changed

+841
-165
lines changed

tests/e2e/gpu_operator_test.go

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ import (
2828

2929
"github.com/NVIDIA/gpu-operator/tests/e2e/framework"
3030
e2elog "github.com/NVIDIA/gpu-operator/tests/e2e/framework/logs"
31-
k8stest "github.com/NVIDIA/gpu-operator/tests/e2e/kubernetes"
32-
"github.com/NVIDIA/gpu-operator/tests/e2e/operator"
31+
"github.com/NVIDIA/gpu-operator/tests/e2e/helpers"
3332
)
3433

3534
var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
@@ -44,15 +43,15 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
4443

4544
// Init global suite vars
4645
var (
47-
operatorClient *operator.Client
46+
operatorClient *helpers.OperatorClient
4847
helmReleaseName string
49-
k8sClient *k8stest.Client
48+
k8sClient *helpers.PodClient
5049
testNamespace *corev1.Namespace
5150
)
5251

5352
BeforeAll(func(ctx context.Context) {
5453
var err error
55-
k8sClient = k8stest.NewClient(f.ClientSet.CoreV1())
54+
k8sClient = helpers.NewPodClient(f.ClientSet.CoreV1())
5655
nsLabels := map[string]string{
5756
"e2e-run": string(framework.RunID),
5857
}
@@ -62,10 +61,10 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
6261
Fail(fmt.Sprintf("failed to create gpu operator namespace %s: %v", tcfg.namespace, err))
6362
}
6463

65-
operatorClient, err = operator.NewClient(
66-
operator.WithNamespace(testNamespace.Name),
67-
operator.WithKubeConfig(framework.TestContext.KubeConfig),
68-
operator.WithChart(tcfg.helmChart),
64+
operatorClient, err = helpers.NewOperatorClient(
65+
helpers.WithNamespace(testNamespace.Name),
66+
helpers.WithKubeConfig(framework.TestContext.KubeConfig),
67+
helpers.WithChart(tcfg.helmChart),
6968
)
7069
if err != nil {
7170
Fail(fmt.Sprintf("failed to instantiate gpu operator client: %v", err))
@@ -79,7 +78,7 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
7978
fmt.Sprintf("validator.image=%s", tcfg.validatorImage),
8079
fmt.Sprintf("validator.version=%s", tcfg.validatorVersion),
8180
}
82-
helmReleaseName, err = operatorClient.Install(ctx, values, operator.ChartOptions{
81+
helmReleaseName, err = operatorClient.Install(ctx, values, helpers.ChartOptions{
8382
CleanupOnFail: true,
8483
GenerateName: true,
8584
Timeout: 5 * time.Minute,
@@ -157,8 +156,12 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
157156
hasRestarts, err := k8sClient.EnsureNoPodRestarts(ctx, pod.Name, pod.Namespace)
158157
Expect(err).NotTo(HaveOccurred())
159158
if !hasRestarts {
160-
errLogs := k8sClient.GetPodLogs(ctx, pod)
161-
e2elog.Logf("printing logs from the pod %s/%s: %s", pod.Namespace, pod.Name, errLogs)
159+
errLogs, err := k8sClient.GetPodLogs(ctx, pod)
160+
if err != nil {
161+
e2elog.Logf("WARN: failed to retrieve logs from pod %s/%s: %v", pod.Namespace, pod.Name, err)
162+
} else {
163+
e2elog.Logf("printing logs from the pod %s/%s: %s", pod.Namespace, pod.Name, errLogs)
164+
}
162165
e2elog.Failf("pod %s/%s has unexpected restarts", pod.Namespace, pod.Name)
163166
}
164167
}

tests/e2e/helpers/clusterpolicy.go

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package helpers
18+
19+
import (
20+
"context"
21+
"time"
22+
23+
nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
24+
gpuclientset "github.com/NVIDIA/gpu-operator/api/versioned"
25+
"github.com/NVIDIA/gpu-operator/internal/conditions"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/util/wait"
28+
"k8s.io/client-go/util/retry"
29+
"k8s.io/utils/ptr"
30+
)
31+
32+
type ClusterPolicyHelper struct {
33+
client gpuclientset.Interface
34+
}
35+
36+
func NewClusterPolicyHelper(client gpuclientset.Interface) *ClusterPolicyHelper {
37+
return &ClusterPolicyHelper{
38+
client: client,
39+
}
40+
}
41+
42+
func (h *ClusterPolicyHelper) Get(ctx context.Context, name string) (*nvidiav1.ClusterPolicy, error) {
43+
return h.client.NvidiaV1().ClusterPolicies().Get(ctx, name, metav1.GetOptions{})
44+
}
45+
46+
func (h *ClusterPolicyHelper) Update(ctx context.Context, cp *nvidiav1.ClusterPolicy) (*nvidiav1.ClusterPolicy, error) {
47+
return h.client.NvidiaV1().ClusterPolicies().Update(ctx, cp, metav1.UpdateOptions{})
48+
}
49+
50+
// modify applies a mutation function to a ClusterPolicy and persists the changes.
51+
// It uses RetryOnConflict to handle concurrent modifications by the operator controller.
52+
func (h *ClusterPolicyHelper) modify(ctx context.Context, name string, mutate func(*nvidiav1.ClusterPolicy)) error {
53+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
54+
clusterPolicy, err := h.Get(ctx, name)
55+
if err != nil {
56+
return err
57+
}
58+
59+
mutate(clusterPolicy)
60+
61+
_, err = h.Update(ctx, clusterPolicy)
62+
return err
63+
})
64+
}
65+
66+
func (h *ClusterPolicyHelper) UpdateDriverVersion(ctx context.Context, name, version string) error {
67+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
68+
clusterPolicy.Spec.Driver.Version = version
69+
})
70+
}
71+
72+
func (h *ClusterPolicyHelper) EnableDCGM(ctx context.Context, name string) error {
73+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
74+
clusterPolicy.Spec.DCGM.Enabled = ptr.To(true)
75+
})
76+
}
77+
78+
func (h *ClusterPolicyHelper) DisableDCGM(ctx context.Context, name string) error {
79+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
80+
clusterPolicy.Spec.DCGM.Enabled = ptr.To(false)
81+
})
82+
}
83+
84+
func (h *ClusterPolicyHelper) EnableDCGMExporter(ctx context.Context, name string) error {
85+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
86+
clusterPolicy.Spec.DCGMExporter.Enabled = ptr.To(true)
87+
})
88+
}
89+
90+
func (h *ClusterPolicyHelper) DisableDCGMExporter(ctx context.Context, name string) error {
91+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
92+
clusterPolicy.Spec.DCGMExporter.Enabled = ptr.To(false)
93+
})
94+
}
95+
96+
func (h *ClusterPolicyHelper) EnableGFD(ctx context.Context, name string) error {
97+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
98+
clusterPolicy.Spec.GPUFeatureDiscovery.Enabled = ptr.To(true)
99+
})
100+
}
101+
102+
func (h *ClusterPolicyHelper) DisableGFD(ctx context.Context, name string) error {
103+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
104+
clusterPolicy.Spec.GPUFeatureDiscovery.Enabled = ptr.To(false)
105+
})
106+
}
107+
108+
func (h *ClusterPolicyHelper) SetMIGStrategy(ctx context.Context, name, strategy string) error {
109+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
110+
clusterPolicy.Spec.MIG.Strategy = nvidiav1.MIGStrategy(strategy)
111+
})
112+
}
113+
114+
func (h *ClusterPolicyHelper) WaitForReady(ctx context.Context, name string, timeout time.Duration) error {
115+
return wait.PollUntilContextTimeout(ctx, defaultPollingInterval, timeout, true, func(ctx context.Context) (bool, error) {
116+
clusterPolicy, err := h.Get(ctx, name)
117+
if err != nil {
118+
return false, err
119+
}
120+
121+
for _, condition := range clusterPolicy.Status.Conditions {
122+
if condition.Type == conditions.Ready && condition.Status == metav1.ConditionTrue {
123+
return true, nil
124+
}
125+
}
126+
127+
return false, nil
128+
})
129+
}
130+

tests/e2e/helpers/constants.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package helpers
18+
19+
import "time"
20+
21+
// Shared defaults for the e2e helpers.
const (
	// upgradeDoneState is the state value that marks a completed driver upgrade.
	upgradeDoneState = "upgrade-done"

	// defaultPollingInterval is the default delay between two checks in
	// polling operations.
	defaultPollingInterval = time.Second * 5
)
28+

tests/e2e/helpers/daemonset.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package helpers
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"time"
23+
24+
appsv1 "k8s.io/api/apps/v1"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/labels"
27+
"k8s.io/apimachinery/pkg/util/wait"
28+
"k8s.io/client-go/kubernetes"
29+
)
30+
31+
type DaemonSetHelper struct {
32+
client kubernetes.Interface
33+
}
34+
35+
func NewDaemonSetHelper(client kubernetes.Interface) *DaemonSetHelper {
36+
return &DaemonSetHelper{
37+
client: client,
38+
}
39+
}
40+
41+
func (h *DaemonSetHelper) GetByLabel(ctx context.Context, namespace, labelKey, labelValue string) (*appsv1.DaemonSet, error) {
42+
labelSelector := labels.SelectorFromSet(map[string]string{
43+
labelKey: labelValue,
44+
}).String()
45+
46+
daemonSetList, err := h.client.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{
47+
LabelSelector: labelSelector,
48+
})
49+
if err != nil {
50+
return nil, fmt.Errorf("failed to list DaemonSets: %w", err)
51+
}
52+
53+
if len(daemonSetList.Items) == 0 {
54+
return nil, fmt.Errorf("no DaemonSet found with label %s=%s", labelKey, labelValue)
55+
}
56+
57+
if len(daemonSetList.Items) > 1 {
58+
return nil, fmt.Errorf("multiple DaemonSets found with label %s=%s", labelKey, labelValue)
59+
}
60+
61+
return &daemonSetList.Items[0], nil
62+
}
63+
64+
func (h *DaemonSetHelper) Get(ctx context.Context, namespace, name string) (*appsv1.DaemonSet, error) {
65+
return h.client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
66+
}
67+
68+
func (h *DaemonSetHelper) WaitForReady(ctx context.Context, namespace, name string, timeout time.Duration) error {
69+
return wait.PollUntilContextTimeout(ctx, defaultPollingInterval, timeout, true, func(ctx context.Context) (bool, error) {
70+
daemonSet, err := h.Get(ctx, namespace, name)
71+
if err != nil {
72+
return false, err
73+
}
74+
75+
if daemonSet.Status.NumberReady == daemonSet.Status.DesiredNumberScheduled &&
76+
daemonSet.Status.NumberReady > 0 {
77+
return true, nil
78+
}
79+
80+
return false, nil
81+
})
82+
}
83+
84+
func (h *DaemonSetHelper) IsReady(ctx context.Context, namespace, name string) (bool, error) {
85+
daemonSet, err := h.Get(ctx, namespace, name)
86+
if err != nil {
87+
return false, err
88+
}
89+
90+
return daemonSet.Status.NumberReady == daemonSet.Status.DesiredNumberScheduled && daemonSet.Status.NumberReady > 0, nil
91+
}
92+
93+
func (h *DaemonSetHelper) GetImage(ctx context.Context, namespace, name string) (string, error) {
94+
daemonSet, err := h.Get(ctx, namespace, name)
95+
if err != nil {
96+
return "", fmt.Errorf("failed to get DaemonSet: %w", err)
97+
}
98+
99+
if len(daemonSet.Spec.Template.Spec.Containers) == 0 {
100+
return "", fmt.Errorf("DaemonSet has no containers")
101+
}
102+
103+
return daemonSet.Spec.Template.Spec.Containers[0].Image, nil
104+
}
105+
106+
func (h *DaemonSetHelper) CheckNoRestarts(ctx context.Context, namespace, name string) error {
107+
daemonSet, err := h.Get(ctx, namespace, name)
108+
if err != nil {
109+
return fmt.Errorf("failed to get DaemonSet: %w", err)
110+
}
111+
112+
labelSelector := labels.SelectorFromSet(daemonSet.Spec.Selector.MatchLabels).String()
113+
podList, err := h.client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
114+
LabelSelector: labelSelector,
115+
})
116+
if err != nil {
117+
return fmt.Errorf("failed to list pods: %w", err)
118+
}
119+
120+
for _, pod := range podList.Items {
121+
for _, containerStatus := range pod.Status.ContainerStatuses {
122+
if containerStatus.RestartCount > 0 {
123+
return fmt.Errorf("pod %s/%s container %s has %d restarts",
124+
pod.Namespace, pod.Name, containerStatus.Name, containerStatus.RestartCount)
125+
}
126+
}
127+
}
128+
129+
return nil
130+
}
131+

0 commit comments

Comments
 (0)