
Commit 7ebe415

add gpu checker
1 parent 440fac2 commit 7ebe415

4 files changed (+445, -32 lines)


.github/workflows/e2e_tests.yaml

Lines changed: 87 additions & 0 deletions
@@ -70,12 +70,99 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator

+      - name: Verify GPU availability in KinD
+        run: |
+          echo "Checking for available GPUs in the KinD cluster..."
+
+          # Wait for GPU operator pods to be ready (with timeout)
+          echo "Waiting for GPU operator pods to be ready..."
+          TIMEOUT=300 # 5 minutes timeout
+          END=$((SECONDS + TIMEOUT))
+
+          while [ $SECONDS -lt $END ]; do
+            # Get total number of pods in the namespace
+            TOTAL_PODS=$(kubectl get pods -n gpu-operator --no-headers | wc -l)
+
+            # Count pods that are either running and ready or completed successfully
+            # Exclude pods that are still initializing
+            READY_PODS=$(kubectl get pods -n gpu-operator --no-headers | grep -E 'Running|Completed' | grep -v 'PodInitializing' | wc -l)
+
+            if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then
+              echo "All GPU operator pods are ready or completed successfully!"
+              break
+            fi
+
+            echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)"
+            echo "Pod status:"
+            kubectl get pods -n gpu-operator
+            sleep 10
+          done
+
+          if [ $SECONDS -ge $END ]; then
+            echo "::error::Timeout waiting for GPU operator pods to be ready"
+            echo "GPU operator pod status:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            exit 1
+          fi
+
+          echo "Node details:"
+          kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:'
+
+          # Check if GPU operator has labeled nodes
+          GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu")
+          if [ "$GPU_LABELS" -eq 0 ]; then
+            echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly."
+            echo "Full node descriptions for debugging:"
+            kubectl describe nodes
+            exit 1
+          fi
+
+          # Check if GPUs are actually allocatable
+          GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | grep -v '^$' | wc -l)
+          if [ "$GPU_ALLOCATABLE" -eq 0 ]; then
+            echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs."
+            echo "Checking GPU operator pods:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            echo "GPU operator pod descriptions:"
+            kubectl describe pods -n gpu-operator
+            exit 1
+          fi
+
+          echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster."
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
           cd codeflare-operator
           echo Setting up CodeFlare stack
           make setup-e2e
+
+          # Create ConfigMap to disable mTLS
+          echo "Creating ConfigMap to disable mTLS..."
+          cat <<EOF | kubectl apply -f -
+          apiVersion: v1
+          kind: ConfigMap
+          metadata:
+            name: codeflare-operator-config
+            namespace: ray-system
+          data:
+            config.yaml: |
+              kuberay:
+                mTLSEnabled: false
+                rayDashboardOAuthEnabled: false
+                ingressDomain: "kind"
+              appwrapper:
+                enabled: true
+          EOF
+
           echo Deploying CodeFlare operator
           make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
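
The new verification step treats node labels and allocatable capacity as two separate signals: the GPU operator can be installed and labelling nodes while `nvidia.com/gpu` is still not allocatable, so both checks must pass before the job moves on to deploying the CodeFlare stack. For debugging a KinD cluster locally, roughly the same allocatable-GPU check can be reproduced in Python. The sketch below is illustrative only; the `count_gpu_nodes` helper is hypothetical (not part of this repository) and assumes `kubectl` is on the PATH and pointed at the cluster.

```python
import subprocess


def count_gpu_nodes() -> int:
    """Count nodes that report an allocatable nvidia.com/gpu resource.

    Hypothetical helper mirroring the workflow's jsonpath check; not part of
    the repository. Assumes kubectl is configured for the KinD cluster.
    """
    result = subprocess.run(
        [
            "kubectl", "get", "nodes", "-o",
            r"jsonpath={.items[*].status.allocatable.nvidia\.com/gpu}",
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    # Nodes with GPUs contribute whitespace-separated counts, e.g. "1 1";
    # nodes without the resource contribute nothing.
    return len(result.stdout.split())


if __name__ == "__main__":
    print(f"Nodes reporting allocatable nvidia.com/gpu: {count_gpu_nodes()}")
```

As with the shell version, a result of zero reproduces the workflow's failure condition even when the operator pods themselves look healthy.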

codeflare-kuberay.code-workspace

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+{
+  "folders": [
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-sdk"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/kuberay"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-operator"
+    }
+  ]
+}

tests/e2e/local_interactive_sdk_kind_test.py

Lines changed: 92 additions & 6 deletions
@@ -11,6 +11,7 @@
 import logging
 import time
 import os
+import subprocess

 from support import *

@@ -54,6 +55,10 @@ def run_local_interactives(
     ):
         cluster_name = "test-ray-cluster-li"
         logger.info(f"Starting run_local_interactives with {number_of_gpus} GPUs")
+
+        logger.info("Cleaning up existing Ray connections...")
+        ray.shutdown()
+        logger.info("Ray connection cleanup completed")

         logger.info("Creating cluster configuration...")
         cluster = Cluster(
@@ -66,7 +71,7 @@ def run_local_interactives(
             head_memory_requests=2,
             head_memory_limits=2,
             worker_cpu_requests="500m",
-            worker_cpu_limits=1,
+            worker_cpu_limits="500m",
             worker_memory_requests=1,
             worker_memory_limits=4,
             worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
@@ -82,8 +87,93 @@ def run_local_interactives(

         logger.info("Waiting for cluster to be ready...")
         cluster.wait_ready()
+        cluster.status()
         logger.info("Cluster is ready")

+        # Wait for pods to be fully ready
+        logger.info("Waiting for pods to be fully ready...")
+        TIMEOUT = 300  # 5 minutes timeout
+        END = time.time() + TIMEOUT
+
+        head_pod_name = None
+        worker_pod_name = None
+
+        while time.time() < END:
+            # Dynamically find pod names using substrings
+            if not head_pod_name:
+                head_pod_name = kubectl_get_pod_name_by_substring(self.namespace, cluster_name, "head")
+                if head_pod_name:
+                    logger.info(f"Discovered head pod by substring: {head_pod_name}")
+                else:
+                    logger.info(f"Head pod not yet found by searching for '{cluster_name}' and 'head' in pod names. Retrying...")
+
+            if not worker_pod_name:
+                worker_pod_name = kubectl_get_pod_name_by_substring(self.namespace, cluster_name, "worker")
+                if worker_pod_name:
+                    logger.info(f"Discovered worker pod by substring: {worker_pod_name}")
+                else:
+                    logger.info(f"Worker pod not yet found by searching for '{cluster_name}' and 'worker' in pod names. Retrying...")
+
+            head_status = "NotFound"
+            worker_status = "NotFound"
+
+            if head_pod_name:
+                head_status = kubectl_get_pod_status(self.namespace, head_pod_name)
+            if worker_pod_name:
+                worker_status = kubectl_get_pod_status(self.namespace, worker_pod_name)
+
+            logger.info(f"Head pod ({head_pod_name or 'N/A'}) status: {head_status}")
+            logger.info(f"Worker pod ({worker_pod_name or 'N/A'}) status: {worker_status}")
+
+            if head_pod_name and worker_pod_name and "Running" in head_status and "Running" in worker_status:
+                head_ready = kubectl_get_pod_ready(self.namespace, head_pod_name)
+                worker_ready = kubectl_get_pod_ready(self.namespace, worker_pod_name)
+
+                if head_ready and worker_ready:
+                    logger.info("All discovered pods and containers are ready!")
+                    break
+                else:
+                    logger.info("Discovered pods are running but containers are not all ready yet...")
+                    if not head_ready and head_pod_name:
+                        head_container_status = kubectl_get_pod_container_status(self.namespace, head_pod_name)
+                        logger.info(f"Head pod ({head_pod_name}) container status: {head_container_status}")
+                    if not worker_ready and worker_pod_name:
+                        worker_container_status = kubectl_get_pod_container_status(self.namespace, worker_pod_name)
+                        logger.info(f"Worker pod ({worker_pod_name}) container status: {worker_container_status}")
+            elif (head_pod_name and "Error" in head_status) or \
+                 (worker_pod_name and "Error" in worker_status):
+                logger.error("Error getting pod status for one or more pods, retrying...")
+            else:
+                logger.info(f"Waiting for pods to be discovered and running... Current status - Head ({head_pod_name or 'N/A'}): {head_status}, Worker ({worker_pod_name or 'N/A'}): {worker_status}")

+            time.sleep(10)
+
+        if time.time() >= END:
+            logger.error("Timeout waiting for pods to be ready or discovered")
+            if not head_pod_name or not worker_pod_name:
+                logger.error("Could not discover head and/or worker pods by name substring. Listing all pods in namespace for debugging:")
+                try:
+                    all_pods_result = subprocess.run(
+                        ["kubectl", "get", "pods", "-n", self.namespace, "-o", "wide"],
+                        capture_output=True, text=True, check=False
+                    )
+                    logger.error(f"Pods in namespace '{self.namespace}':\n{all_pods_result.stdout}")
+                    if all_pods_result.stderr:
+                        logger.error(f"Error listing pods: {all_pods_result.stderr}")
+                except Exception as e_pods:
+                    logger.error(f"Exception while trying to list all pods: {e_pods}")
+
+            if head_pod_name:
+                logger.error(f"Final head pod ({head_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, head_pod_name)}")
+            else:
+                logger.error(f"Final head pod status: Not Discovered by searching for '{cluster_name}' and 'head' in pod names.")
+
+            if worker_pod_name:
+                logger.error(f"Final worker pod ({worker_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, worker_pod_name)}")
+            else:
+                logger.error(f"Final worker pod status: Not Discovered by searching for '{cluster_name}' and 'worker' in pod names.")
+            raise TimeoutError("Pods did not become ready (or were not discovered by name substring) within the timeout period")
+
         logger.info("Generating TLS certificates...")
         generate_cert.generate_tls_cert(cluster_name, self.namespace)
         logger.info("TLS certificates generated")
@@ -107,13 +197,9 @@ def run_local_interactives(
         cluster_uri = cluster.cluster_uri()
         logger.info(f"Cluster URI: {cluster_uri}")

-        logger.info("Shutting down any existing Ray connections...")
-        ray.shutdown()
-        logger.info("Ray shutdown completed")
-
         logger.info("Initializing Ray connection...")
         try:
-            ray.init(address=client_url, logging_level="DEBUG")
+            ray.init(address=client_url, logging_level="INFO")
             logger.info("Ray initialization successful")
         except Exception as e:
             logger.error(f"Ray initialization failed: {str(e)}")
