@@ -70,7 +70,7 @@ def __init__(self, config: ClusterConfiguration):
70
70
self .config = config
71
71
self .app_wrapper_yaml = self .create_app_wrapper ()
72
72
self .app_wrapper_name = self .app_wrapper_yaml .split ("." )[0 ]
73
- self ._client = None
73
+ self ._job_submission_client = None
74
74
75
75
@property
76
76
def _client_headers (self ):
@@ -86,23 +86,25 @@ def _client_verify_tls(self):
86
86
return not self .config .openshift_oauth
87
87
88
88
@property
def job_client(self):
    """
    Lazily create and cache the JobSubmissionClient used to talk to this cluster.

    The client is built on first access against `self.cluster_dashboard_uri()`
    and stored in `self._job_submission_client`; later accesses return the
    cached instance. When `self.config.openshift_oauth` is set, the client is
    created with `self._client_headers` and `self._client_verify_tls`.
    """
    if self._job_submission_client:
        return self._job_submission_client

    # The authorization API key is intentionally NOT printed here: echoing it
    # wrote the bearer credential to stdout (and therefore into notebook
    # output and logs) every time the client was first created.
    if self.config.openshift_oauth:
        self._job_submission_client = JobSubmissionClient(
            self.cluster_dashboard_uri(),
            headers=self._client_headers,
            verify=self._client_verify_tls,
        )
    else:
        self._job_submission_client = JobSubmissionClient(
            self.cluster_dashboard_uri()
        )
    return self._job_submission_client
106
108
107
109
def evaluate_dispatch_priority (self ):
108
110
priority_class = self .config .dispatch_priority
@@ -141,6 +143,10 @@ def create_app_wrapper(self):
141
143
142
144
# Before attempting to create the cluster AW, let's evaluate the ClusterConfig
143
145
if self .config .dispatch_priority :
146
+ if not self .config .mcad :
147
+ raise ValueError (
148
+ "Invalid Cluster Configuration, cannot have dispatch priority without MCAD"
149
+ )
144
150
priority_val = self .evaluate_dispatch_priority ()
145
151
if priority_val == None :
146
152
raise ValueError (
@@ -163,6 +169,7 @@ def create_app_wrapper(self):
163
169
template = self .config .template
164
170
image = self .config .image
165
171
instascale = self .config .instascale
172
+ mcad = self .config .mcad
166
173
instance_types = self .config .machine_types
167
174
env = self .config .envs
168
175
local_interactive = self .config .local_interactive
@@ -183,6 +190,7 @@ def create_app_wrapper(self):
183
190
template = template ,
184
191
image = image ,
185
192
instascale = instascale ,
193
+ mcad = mcad ,
186
194
instance_types = instance_types ,
187
195
env = env ,
188
196
local_interactive = local_interactive ,
@@ -207,15 +215,18 @@ def up(self):
207
215
try :
208
216
config_check ()
209
217
api_instance = client .CustomObjectsApi (api_config_handler ())
210
- with open (self .app_wrapper_yaml ) as f :
211
- aw = yaml .load (f , Loader = yaml .FullLoader )
212
- api_instance .create_namespaced_custom_object (
213
- group = "workload.codeflare.dev" ,
214
- version = "v1beta1" ,
215
- namespace = namespace ,
216
- plural = "appwrappers" ,
217
- body = aw ,
218
- )
218
+ if self .config .mcad :
219
+ with open (self .app_wrapper_yaml ) as f :
220
+ aw = yaml .load (f , Loader = yaml .FullLoader )
221
+ api_instance .create_namespaced_custom_object (
222
+ group = "workload.codeflare.dev" ,
223
+ version = "v1beta1" ,
224
+ namespace = namespace ,
225
+ plural = "appwrappers" ,
226
+ body = aw ,
227
+ )
228
+ else :
229
+ self ._component_resources_up (namespace , api_instance )
219
230
except Exception as e : # pragma: no cover
220
231
return _kube_api_error_handling (e )
221
232
@@ -228,13 +239,16 @@ def down(self):
228
239
try :
229
240
config_check ()
230
241
api_instance = client .CustomObjectsApi (api_config_handler ())
231
- api_instance .delete_namespaced_custom_object (
232
- group = "workload.codeflare.dev" ,
233
- version = "v1beta1" ,
234
- namespace = namespace ,
235
- plural = "appwrappers" ,
236
- name = self .app_wrapper_name ,
237
- )
242
+ if self .config .mcad :
243
+ api_instance .delete_namespaced_custom_object (
244
+ group = "workload.codeflare.dev" ,
245
+ version = "v1beta1" ,
246
+ namespace = namespace ,
247
+ plural = "appwrappers" ,
248
+ name = self .app_wrapper_name ,
249
+ )
250
+ else :
251
+ self ._component_resources_down (namespace , api_instance )
238
252
except Exception as e : # pragma: no cover
239
253
return _kube_api_error_handling (e )
240
254
@@ -252,42 +266,46 @@ def status(
252
266
"""
253
267
ready = False
254
268
status = CodeFlareClusterStatus .UNKNOWN
255
- # check the app wrapper status
256
- appwrapper = _app_wrapper_status (self .config .name , self .config .namespace )
257
- if appwrapper :
258
- if appwrapper .status in [
259
- AppWrapperStatus .RUNNING ,
260
- AppWrapperStatus .COMPLETED ,
261
- AppWrapperStatus .RUNNING_HOLD_COMPLETION ,
262
- ]:
263
- ready = False
264
- status = CodeFlareClusterStatus .STARTING
265
- elif appwrapper .status in [
266
- AppWrapperStatus .FAILED ,
267
- AppWrapperStatus .DELETED ,
268
- ]:
269
- ready = False
270
- status = CodeFlareClusterStatus .FAILED # should deleted be separate
271
- return status , ready # exit early, no need to check ray status
272
- elif appwrapper .status in [
273
- AppWrapperStatus .PENDING ,
274
- AppWrapperStatus .QUEUEING ,
275
- ]:
276
- ready = False
277
- if appwrapper .status == AppWrapperStatus .PENDING :
278
- status = CodeFlareClusterStatus .QUEUED
279
- else :
280
- status = CodeFlareClusterStatus .QUEUEING
281
- if print_to_console :
282
- pretty_print .print_app_wrappers_status ([appwrapper ])
283
- return (
284
- status ,
285
- ready ,
286
- ) # no need to check the ray status since still in queue
269
+ if self .config .mcad :
270
+ # check the app wrapper status
271
+ appwrapper = _app_wrapper_status (self .config .name , self .config .namespace )
272
+ if appwrapper :
273
+ if appwrapper .status in [
274
+ AppWrapperStatus .RUNNING ,
275
+ AppWrapperStatus .COMPLETED ,
276
+ AppWrapperStatus .RUNNING_HOLD_COMPLETION ,
277
+ ]:
278
+ ready = False
279
+ status = CodeFlareClusterStatus .STARTING
280
+ elif appwrapper .status in [
281
+ AppWrapperStatus .FAILED ,
282
+ AppWrapperStatus .DELETED ,
283
+ ]:
284
+ ready = False
285
+ status = CodeFlareClusterStatus .FAILED # should deleted be separate
286
+ return status , ready # exit early, no need to check ray status
287
+ elif appwrapper .status in [
288
+ AppWrapperStatus .PENDING ,
289
+ AppWrapperStatus .QUEUEING ,
290
+ ]:
291
+ ready = False
292
+ if appwrapper .status == AppWrapperStatus .PENDING :
293
+ status = CodeFlareClusterStatus .QUEUED
294
+ else :
295
+ status = CodeFlareClusterStatus .QUEUEING
296
+ if print_to_console :
297
+ pretty_print .print_app_wrappers_status ([appwrapper ])
298
+ return (
299
+ status ,
300
+ ready ,
301
+ ) # no need to check the ray status since still in queue
287
302
288
303
# check the ray cluster status
289
304
cluster = _ray_cluster_status (self .config .name , self .config .namespace )
290
- if cluster and not cluster .status == RayClusterStatus .UNKNOWN :
305
+ if cluster :
306
+ if cluster .status == RayClusterStatus .UNKNOWN :
307
+ ready = False
308
+ status = CodeFlareClusterStatus .STARTING
291
309
if cluster .status == RayClusterStatus .READY :
292
310
ready = True
293
311
status = CodeFlareClusterStatus .READY
@@ -407,19 +425,19 @@ def list_jobs(self) -> List:
407
425
"""
408
426
This method accesses the head ray node in your cluster and lists the running jobs.
409
427
"""
410
- return self .client .list_jobs ()
428
+ return self .job_client .list_jobs ()
411
429
412
430
def job_status(self, job_id: str) -> str:
    """
    Ask this cluster's job client for the status of the job identified by `job_id`
    and return whatever `get_job_status` reports.
    """
    submission_client = self.job_client
    return submission_client.get_job_status(job_id)
417
435
418
436
def job_logs(self, job_id: str) -> str:
    """
    Ask this cluster's job client for the logs of the job identified by `job_id`
    and return whatever `get_job_logs` reports.
    """
    submission_client = self.job_client
    return submission_client.get_job_logs(job_id)
423
441
424
442
def torchx_config (
425
443
self , working_dir : str = None , requirements : str = None
@@ -435,7 +453,7 @@ def torchx_config(
435
453
to_return ["requirements" ] = requirements
436
454
return to_return
437
455
438
- def from_k8_cluster_object (rc ):
456
+ def from_k8_cluster_object (rc , mcad = True ):
439
457
machine_types = (
440
458
rc ["metadata" ]["labels" ]["orderedinstance" ].split ("_" )
441
459
if "orderedinstance" in rc ["metadata" ]["labels" ]
@@ -474,6 +492,7 @@ def from_k8_cluster_object(rc):
474
492
0
475
493
]["image" ],
476
494
local_interactive = local_interactive ,
495
+ mcad = mcad ,
477
496
)
478
497
return Cluster (cluster_config )
479
498
@@ -484,6 +503,66 @@ def local_client_url(self):
484
503
else :
485
504
return "None"
486
505
506
def _component_resources_up(
    self, namespace: str, api_instance: client.CustomObjectsApi
):
    """
    Create each resource found in the generated YAML file directly in `namespace`.

    Every document in `self.app_wrapper_yaml` is dispatched on its "kind":
    RayCluster and Route documents are created through `api_instance` as
    namespaced custom objects, Secret documents through a CoreV1Api client.
    Documents of any other kind are ignored.
    """
    with open(self.app_wrapper_yaml) as stream:
        for document in yaml.load_all(stream, Loader=yaml.FullLoader):
            kind = document["kind"]

            # Secrets are core resources, so they use the core API client.
            if kind == "Secret":
                core_api = client.CoreV1Api(api_config_handler())
                core_api.create_namespaced_secret(
                    namespace=namespace,
                    body=document,
                )
                continue

            if kind == "RayCluster":
                group, version, plural = "ray.io", "v1alpha1", "rayclusters"
            elif kind == "Route":
                group, version, plural = "route.openshift.io", "v1", "routes"
            else:
                continue

            api_instance.create_namespaced_custom_object(
                group=group,
                version=version,
                namespace=namespace,
                plural=plural,
                body=document,
            )
534
+
535
def _component_resources_down(
    self, namespace: str, api_instance: client.CustomObjectsApi
):
    """
    Delete each resource described in the generated YAML file from `namespace`.

    Every document in `self.app_wrapper_yaml` is dispatched on its "kind":
    the RayCluster is deleted by `self.app_wrapper_name`, Routes and Secrets
    by the name in their own metadata. Documents of any other kind are ignored.
    """
    with open(self.app_wrapper_yaml) as stream:
        for document in yaml.load_all(stream, Loader=yaml.FullLoader):
            kind = document["kind"]

            # Secrets are core resources, so they use the core API client.
            if kind == "Secret":
                secret_name = document["metadata"]["name"]
                core_api = client.CoreV1Api(api_config_handler())
                core_api.delete_namespaced_secret(
                    namespace=namespace,
                    name=secret_name,
                )
                continue

            if kind == "RayCluster":
                group, version, plural = "ray.io", "v1alpha1", "rayclusters"
                target_name = self.app_wrapper_name
            elif kind == "Route":
                group, version, plural = "route.openshift.io", "v1", "routes"
                target_name = document["metadata"]["name"]
            else:
                continue

            api_instance.delete_namespaced_custom_object(
                group=group,
                version=version,
                namespace=namespace,
                plural=plural,
                name=target_name,
            )
565
+
487
566
488
567
def list_all_clusters (namespace : str , print_to_console : bool = True ):
489
568
"""
@@ -549,13 +628,33 @@ def get_cluster(cluster_name: str, namespace: str = "default"):
549
628
550
629
for rc in rcs ["items" ]:
551
630
if rc ["metadata" ]["name" ] == cluster_name :
552
- return Cluster .from_k8_cluster_object (rc )
631
+ mcad = _check_aw_exists (cluster_name , namespace )
632
+ return Cluster .from_k8_cluster_object (rc , mcad = mcad )
553
633
raise FileNotFoundError (
554
634
f"Cluster { cluster_name } is not found in { namespace } namespace"
555
635
)
556
636
557
637
558
638
# private methods
639
def _check_aw_exists(name: str, namespace: str) -> bool:
    """
    Report whether an AppWrapper called `name` is present in `namespace`.

    The `appwrappers` custom objects (workload.codeflare.dev/v1beta1) of the
    namespace are listed and searched for a matching metadata name. If the
    listing raises, the result of `_kube_api_error_handling` is returned instead.
    """
    try:
        config_check()
        custom_api = client.CustomObjectsApi(api_config_handler())
        app_wrappers = custom_api.list_namespaced_custom_object(
            group="workload.codeflare.dev",
            version="v1beta1",
            namespace=namespace,
            plural="appwrappers",
        )
    except Exception as e:  # pragma: no cover
        return _kube_api_error_handling(e, print_error=False)

    return any(
        item["metadata"]["name"] == name for item in app_wrappers["items"]
    )
656
+
657
+
559
658
def _get_ingress_domain ():
560
659
try :
561
660
config_check ()
@@ -660,6 +759,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
660
759
661
760
config_check ()
662
761
api_instance = client .CustomObjectsApi (api_config_handler ())
762
+ # UPDATE THIS
663
763
routes = api_instance .list_namespaced_custom_object (
664
764
group = "route.openshift.io" ,
665
765
version = "v1" ,
0 commit comments