import logging
import time
import os
+import subprocess

from support import *

@@ -54,6 +55,10 @@ def run_local_interactives(
    ):
        cluster_name = "test-ray-cluster-li"
        logger.info(f"Starting run_local_interactives with {number_of_gpus} GPUs")
+
+        logger.info("Cleaning up existing Ray connections...")
+        ray.shutdown()
+        logger.info("Ray connection cleanup completed")

        logger.info("Creating cluster configuration...")
        cluster = Cluster(
@@ -66,7 +71,7 @@ def run_local_interactives(
            head_memory_requests=2,
            head_memory_limits=2,
            worker_cpu_requests="500m",
-            worker_cpu_limits=1,
+            worker_cpu_limits="500m",
            worker_memory_requests=1,
            worker_memory_limits=4,
            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
@@ -82,8 +87,93 @@ def run_local_interactives(

        logger.info("Waiting for cluster to be ready...")
        cluster.wait_ready()
+        cluster.status()
        logger.info("Cluster is ready")

+        # Wait for pods to be fully ready
+        logger.info("Waiting for pods to be fully ready...")
+        TIMEOUT = 300  # 5 minutes timeout
+        END = time.time() + TIMEOUT
+
+        head_pod_name = None
+        worker_pod_name = None
+
+        while time.time() < END:
+            # Dynamically find pod names using substrings
+            if not head_pod_name:
+                head_pod_name = kubectl_get_pod_name_by_substring(self.namespace, cluster_name, "head")
+                if head_pod_name:
+                    logger.info(f"Discovered head pod by substring: {head_pod_name}")
+                else:
+                    logger.info(f"Head pod not yet found by searching for '{cluster_name}' and 'head' in pod names. Retrying...")
+
+            if not worker_pod_name:
+                worker_pod_name = kubectl_get_pod_name_by_substring(self.namespace, cluster_name, "worker")
+                if worker_pod_name:
+                    logger.info(f"Discovered worker pod by substring: {worker_pod_name}")
+                else:
+                    logger.info(f"Worker pod not yet found by searching for '{cluster_name}' and 'worker' in pod names. Retrying...")
+
+            head_status = "NotFound"
+            worker_status = "NotFound"
+
+            if head_pod_name:
+                head_status = kubectl_get_pod_status(self.namespace, head_pod_name)
+            if worker_pod_name:
+                worker_status = kubectl_get_pod_status(self.namespace, worker_pod_name)
+
+            logger.info(f"Head pod ({head_pod_name or 'N/A'}) status: {head_status}")
+            logger.info(f"Worker pod ({worker_pod_name or 'N/A'}) status: {worker_status}")
+
+            if head_pod_name and worker_pod_name and "Running" in head_status and "Running" in worker_status:
+                head_ready = kubectl_get_pod_ready(self.namespace, head_pod_name)
+                worker_ready = kubectl_get_pod_ready(self.namespace, worker_pod_name)
+
+                if head_ready and worker_ready:
+                    logger.info("All discovered pods and containers are ready!")
+                    break
+                else:
+                    logger.info("Discovered pods are running but containers are not all ready yet...")
+                    if not head_ready and head_pod_name:
+                        head_container_status = kubectl_get_pod_container_status(self.namespace, head_pod_name)
+                        logger.info(f"Head pod ({head_pod_name}) container status: {head_container_status}")
+                    if not worker_ready and worker_pod_name:
+                        worker_container_status = kubectl_get_pod_container_status(self.namespace, worker_pod_name)
+                        logger.info(f"Worker pod ({worker_pod_name}) container status: {worker_container_status}")
+            elif (head_pod_name and "Error" in head_status) or \
+                (worker_pod_name and "Error" in worker_status):
+                logger.error("Error getting pod status for one or more pods, retrying...")
+            else:
+                logger.info(f"Waiting for pods to be discovered and running... Current status - Head ({head_pod_name or 'N/A'}): {head_status}, Worker ({worker_pod_name or 'N/A'}): {worker_status}")
+
+            time.sleep(10)
+
+        if time.time() >= END:
+            logger.error("Timeout waiting for pods to be ready or discovered")
+            if not head_pod_name or not worker_pod_name:
+                logger.error("Could not discover head and/or worker pods by name substring. Listing all pods in namespace for debugging:")
+                try:
+                    all_pods_result = subprocess.run(
+                        ["kubectl", "get", "pods", "-n", self.namespace, "-o", "wide"],
+                        capture_output=True, text=True, check=False
+                    )
+                    logger.error(f"Pods in namespace '{self.namespace}':\n{all_pods_result.stdout}")
+                    if all_pods_result.stderr:
+                        logger.error(f"Error listing pods: {all_pods_result.stderr}")
+                except Exception as e_pods:
+                    logger.error(f"Exception while trying to list all pods: {e_pods}")
+
+            if head_pod_name:
+                logger.error(f"Final head pod ({head_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, head_pod_name)}")
+            else:
+                logger.error(f"Final head pod status: Not discovered by searching for '{cluster_name}' and 'head' in pod names.")
+
+            if worker_pod_name:
+                logger.error(f"Final worker pod ({worker_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, worker_pod_name)}")
+            else:
+                logger.error(f"Final worker pod status: Not discovered by searching for '{cluster_name}' and 'worker' in pod names.")
+            raise TimeoutError("Pods did not become ready (or were not discovered by name substring) within the timeout period")
+

        logger.info("Generating TLS certificates...")
        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        logger.info("TLS certificates generated")
@@ -107,13 +197,9 @@ def run_local_interactives(
        cluster_uri = cluster.cluster_uri()
        logger.info(f"Cluster URI: {cluster_uri}")

-        logger.info("Shutting down any existing Ray connections...")
-        ray.shutdown()
-        logger.info("Ray shutdown completed")
-
        logger.info("Initializing Ray connection...")
        try:
-            ray.init(address=client_url, logging_level="DEBUG")
+            ray.init(address=client_url, logging_level="INFO")
            logger.info("Ray initialization successful")
        except Exception as e:
            logger.error(f"Ray initialization failed: {str(e)}")
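A possible simplification, not part of this change: once the head and worker pod names have been discovered, the readiness poll could be delegated to kubectl's built-in wait. A minimal sketch, assuming the same namespace and discovered pod names as above:

```python
import subprocess


def wait_for_pod_ready(namespace, pod_name, timeout_seconds=300):
    # Delegate the readiness poll to kubectl itself; returns True if the pod
    # reached the Ready condition within the timeout, False otherwise.
    result = subprocess.run(
        ["kubectl", "wait", "--for=condition=Ready", f"pod/{pod_name}",
         "-n", namespace, f"--timeout={timeout_seconds}s"],
        capture_output=True, text=True, check=False,
    )
    return result.returncode == 0
```

This trades the per-iteration logging in the hand-rolled loop for a single blocking call, so the existing loop may still be preferable when the intermediate container statuses are useful for debugging CI failures.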