8
8
import pytest
9
9
import ray
10
10
import math
11
+ import logging
12
+ import time
13
+ import os
11
14
12
15
from support import *
13
16
17
# Module-level logger used by every test step below. basicConfig asks for
# DEBUG output on the root logger; per the logging docs it does nothing if
# the root logger already has handlers configured.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)
20
+
14
21
15
22
@pytest .mark .kind
16
23
class TestRayLocalInteractiveOauth :
17
24
def setup_method(self):
    """Prepare a test run by initializing the Kubernetes client.

    Delegates to ``initialize_kubernetes_client`` (brought in by
    ``from support import *``), handing it this test instance, and logs
    before and after the call.
    """
    logger.info("Setting up test environment...")
    # presumably attaches client handles to this instance; verify in support
    initialize_kubernetes_client(self)
    logger.info("Kubernetes client initialized")
19
28
20
29
def teardown_method(self):
    """Clean up after a test: remove the namespace, then the Kueue resources.

    Both helpers come from ``support`` and receive this test instance.
    The namespace is deleted first, matching the order the test creates
    things in reverse.
    """
    logger.info("Cleaning up test environment...")
    delete_namespace(self)
    delete_kueue_resources(self)
    logger.info("Cleanup completed")
23
34
24
35
def test_local_interactives(self):
    """Run the local-interactive Ray scenario with the default GPU count (0).

    Creates the namespace and Kueue resources, then hands off to
    ``run_local_interactives`` with its default arguments.
    """
    logger.info("Starting test_local_interactives...")

    # NOTE(review): pytest's xunit-style hooks normally invoke setup_method
    # before each test, so this explicit call looks redundant -- confirm
    # before removing it.
    self.setup_method()

    create_namespace(self)
    create_kueue_resources(self)

    self.run_local_interactives()
    logger.info("test_local_interactives completed")
29
42
30
43
@pytest.mark.nvidia_gpu
def test_local_interactives_nvidia_gpu(self):
    """Run the local-interactive Ray scenario requesting one GPU.

    Same flow as the CPU-only test, but ``run_local_interactives`` is
    called with ``number_of_gpus=1``. Tagged with the ``nvidia_gpu``
    pytest marker.
    """
    logger.info("Starting test_local_interactives_nvidia_gpu...")

    # NOTE(review): pytest's xunit-style hooks normally invoke setup_method
    # before each test, so this explicit call looks redundant -- confirm
    # before removing it.
    self.setup_method()

    create_namespace(self)
    create_kueue_resources(self)

    self.run_local_interactives(number_of_gpus=1)
    logger.info("test_local_interactives_nvidia_gpu completed")
36
51
37
52
def run_local_interactives (
38
53
self , gpu_resource_name = "nvidia.com/gpu" , number_of_gpus = 0
39
54
):
40
55
cluster_name = "test-ray-cluster-li"
56
+ logger .info (f"Starting run_local_interactives with { number_of_gpus } GPUs" )
41
57
58
+ logger .info ("Creating cluster configuration..." )
42
59
cluster = Cluster (
43
60
ClusterConfiguration (
44
61
name = cluster_name ,
@@ -57,37 +74,97 @@ def run_local_interactives(
57
74
verify_tls = False ,
58
75
)
59
76
)
77
+ logger .info ("Cluster configuration created" )
78
+
79
+ logger .info ("Starting cluster deployment..." )
60
80
cluster .up ()
81
+ logger .info ("Cluster deployment initiated" )
82
+
83
+ logger .info ("Waiting for cluster to be ready..." )
61
84
cluster .wait_ready ()
85
+ logger .info ("Cluster is ready" )
62
86
87
+ logger .info ("Generating TLS certificates..." )
63
88
generate_cert .generate_tls_cert (cluster_name , self .namespace )
89
+ logger .info ("TLS certificates generated" )
90
+
91
+ logger .info ("Exporting environment variables..." )
64
92
generate_cert .export_env (cluster_name , self .namespace )
93
+ logger .info ("Environment variables exported" )
94
+
95
+ client_url = cluster .local_client_url ()
96
+ logger .info (f"Ray client URL: { client_url } " )
65
97
66
- print (cluster .local_client_url ())
98
+ logger .info ("Checking cluster status..." )
99
+ status = cluster .status ()
100
+ logger .info (f"Cluster status: { status } " )
67
101
102
+ logger .info ("Checking cluster dashboard URI..." )
103
+ dashboard_uri = cluster .cluster_dashboard_uri ()
104
+ logger .info (f"Dashboard URI: { dashboard_uri } " )
105
+
106
+ logger .info ("Checking cluster URI..." )
107
+ cluster_uri = cluster .cluster_uri ()
108
+ logger .info (f"Cluster URI: { cluster_uri } " )
109
+
110
+ logger .info ("Shutting down any existing Ray connections..." )
68
111
ray .shutdown ()
69
- ray .init (address = cluster .local_client_url (), logging_level = "DEBUG" )
112
+ logger .info ("Ray shutdown completed" )
113
+
114
+ logger .info ("Initializing Ray connection..." )
115
+ try :
116
+ ray .init (address = client_url , logging_level = "DEBUG" )
117
+ logger .info ("Ray initialization successful" )
118
+ except Exception as e :
119
+ logger .error (f"Ray initialization failed: { str (e )} " )
120
+ logger .error (f"Error type: { type (e )} " )
121
+ raise
122
+
123
+ logger .info ("Defining Ray remote functions..." )
70
124
71
125
@ray.remote(num_gpus=number_of_gpus / 2)
def heavy_calculation_part(num_iterations):
    """Accumulate sin(i) * cos(j) * tan(k) over a num_iterations**3 grid.

    Declared as a Ray remote function requesting ``number_of_gpus / 2``
    GPUs, where ``number_of_gpus`` is taken from the enclosing scope.

    Returns:
        The float total of all products.
    """
    logger.info(
        f"Starting heavy_calculation_part with {num_iterations} iterations"
    )
    steps = range(num_iterations)
    total = 0.0
    # Keep the nesting order and the plain += accumulation: the caller
    # compares the final sum against an exact float literal, so the
    # floating-point summation order must not change.
    for a in steps:
        for b in steps:
            for c in steps:
                total += math.sin(a) * math.cos(b) * math.tan(c)
    logger.info("heavy_calculation_part completed")
    return total
79
137
80
138
@ray.remote(num_gpus=number_of_gpus / 2)
def heavy_calculation(num_iterations):
    """Fan the work out to 30 ``heavy_calculation_part`` tasks and sum them.

    Each sub-task receives ``num_iterations // 30`` as its iteration count.
    Declared as a Ray remote function requesting ``number_of_gpus / 2``
    GPUs, where ``number_of_gpus`` is taken from the enclosing scope.

    Returns:
        The sum of the 30 partial results.
    """
    logger.info(f"Starting heavy_calculation with {num_iterations} iterations")
    share = num_iterations // 30
    pending = [heavy_calculation_part.remote(share) for _ in range(30)]
    # Blocks until every sub-task has produced its partial result.
    partials = ray.get(pending)
    logger.info("heavy_calculation completed")
    return sum(partials)
86
146
147
+ logger .info ("Submitting calculation task..." )
87
148
ref = heavy_calculation .remote (3000 )
88
- result = ray .get (ref )
89
- assert result == 1789.4644387076714
90
- ray .cancel (ref )
149
+ logger .info ("Task submitted, waiting for result..." )
150
+
151
+ try :
152
+ result = ray .get (ref )
153
+ logger .info (f"Calculation completed with result: { result } " )
154
+ assert result == 1789.4644387076714
155
+ logger .info ("Result assertion passed" )
156
+ except Exception as e :
157
+ logger .error (f"Error during calculation: { str (e )} " )
158
+ raise
159
+ finally :
160
+ logger .info ("Cancelling task reference..." )
161
+ ray .cancel (ref )
162
+ logger .info ("Task cancelled" )
163
+
164
+ logger .info ("Shutting down Ray..." )
91
165
ray .shutdown ()
166
+ logger .info ("Ray shutdown completed" )
92
167
168
+ logger .info ("Tearing down cluster..." )
93
169
cluster .down ()
170
+ logger .info ("Cluster teardown completed" )
0 commit comments