Skip to content

Commit 9050740

Browse files
committed
CLI Layout and Create RayCluster function (#227)
* Create: base and file layout for CLI * Add: Create raycluster command for CLI * Refactor: refactor CLI using pre-commit * Test: unit tests for create raycluster function in the CLI * Update: update egg-info with more paths * Change: change Framework Cluster to RayCluster * merge: rebase with main * Fix: unit tests * Change: create cluster to define cluster in unit tests * Add: error handling for invalid command * test: change tests so cli cluster definition has its own yaml file
1 parent cecae82 commit 9050740

File tree

10 files changed

+327
-0
lines changed

10 files changed

+327
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ We use pre-commit to make sure the code is consistently formatted. To make sure
3333
- To run the unit tests, run `pytest -v tests/unit_test.py`
3434
- Any new test functions/scripts can be added into the `tests` folder
3535
- NOTE: Functional tests coming soon, will live in `tests/func_test.py`
36+
- To test the CLI, run `codeflare` followed by a command. To see the list of available commands, run `codeflare` with no arguments.
3637

3738
#### Code Coverage
3839

pyproject.toml

+8
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ codeflare-torchx = "0.6.0.dev0"
2929
cryptography = "40.0.2"
3030
executing = "1.2.0"
3131
pydantic = "< 2"
32+
click = "8.0.4"
3233

3334
[tool.poetry.group.docs]
3435
optional = true
@@ -40,3 +41,10 @@ pdoc3 = "0.10.0"
4041
pytest = "7.4.0"
4142
coverage = "7.2.7"
4243
pytest-mock = "3.11.1"
44+
45+
[tool.poetry.scripts]
46+
codeflare = "codeflare_sdk.cli.codeflare_cli:cli"
47+
48+
[build-system]
49+
requires = ["poetry_core>=1.0.0"]
50+
build-backend = "poetry.core.masonry.api"

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ codeflare-torchx==0.6.0.dev0
66
pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000
77
cryptography==40.0.2
88
executing==1.2.0
9+
click==8.0.4

src/codeflare_sdk.egg-info/SOURCES.txt

+4
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ src/codeflare_sdk/utils/generate_cert.py
1919
src/codeflare_sdk/utils/generate_yaml.py
2020
src/codeflare_sdk/utils/kube_api_helpers.py
2121
src/codeflare_sdk/utils/pretty_print.py
22+
src/codeflare_sdk/cli/__init__.py
23+
src/codeflare_sdk/cli/codeflare_cli.py
24+
src/codeflare_sdk/cli/commands/create.py
25+
src/codeflare_sdk/cli/cli_utils.py

src/codeflare_sdk/cli/__init__.py

Whitespace-only changes.

src/codeflare_sdk/cli/cli_utils.py

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import ast
2+
import click
3+
4+
5+
class PythonLiteralOption(click.Option):
    """Click option whose raw string value is parsed as a Python literal.

    This lets list and dict arguments be given on the command line, e.g.
    ``--machine_types='["cpu.small", "gpu.large"]'``.
    """

    def type_cast_value(self, ctx, value):
        """Convert the raw option string into a Python object.

        Args:
            ctx: The current click context (unused here).
            value: The raw value supplied for the option; falsy when the
                option was not given.

        Returns:
            ``None`` when ``value`` is falsy, otherwise the result of
            ``ast.literal_eval(value)``.

        Raises:
            click.BadParameter: If ``value`` cannot be parsed as a literal.
        """
        if not value:
            return None
        try:
            return ast.literal_eval(value)
        # Catch only the errors literal_eval raises for bad input. A bare
        # ``except`` would also convert KeyboardInterrupt/SystemExit into
        # BadParameter.
        except (
            ValueError,
            TypeError,
            SyntaxError,
            MemoryError,
            RecursionError,
        ) as err:
            raise click.BadParameter(value) from err
+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import click
2+
import sys
3+
import os
4+
5+
# Absolute path of the "commands" directory located next to this module.
cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands"))
6+
7+
8+
class CodeflareCLI(click.MultiCommand):
    """Multi-command whose subcommands are the Python files in ``cmd_folder``.

    Each ``<name>.py`` file in that folder (except ``__init__.py``) is exposed
    as the subcommand ``<name>``; the object named ``cli`` defined in that
    file is used as the command.
    """

    def list_commands(self, ctx):
        """Return the sorted names of the command files in ``cmd_folder``."""
        return sorted(
            filename[:-3]
            for filename in os.listdir(cmd_folder)
            if filename.endswith(".py") and filename != "__init__.py"
        )

    def get_command(self, ctx, name):
        """Load ``<name>.py`` from ``cmd_folder`` and return its ``cli`` object.

        Args:
            ctx: The current click context.
            name: Subcommand name as typed by the user.

        Returns:
            The ``cli`` object defined by the command file, or ``None`` when
            ``name`` is not one of the names from ``list_commands``.
        """
        # ``name`` comes straight from the command line. Resolve only names
        # that list_commands advertises, so "__init__" or a value containing
        # path separators is never opened and executed.
        if name not in self.list_commands(ctx):
            return None

        ns = {}
        fn = os.path.join(cmd_folder, name + ".py")
        try:
            with open(fn) as f:
                code = compile(f.read(), fn, "exec")
        except FileNotFoundError:
            # The file disappeared between listing and opening.
            return None

        # Run the command file outside the ``try`` so that a
        # FileNotFoundError raised by the file's own code is not mistaken
        # for a missing command.
        exec(code, ns, ns)
        return ns["cli"]
27+
28+
29+
# Root command of the CLI; its subcommands are supplied by CodeflareCLI.
# The click context is received but not used by the callback itself.
@click.command(cls=CodeflareCLI)
@click.pass_context
def cli(ctx):
    return None


if __name__ == "__main__":
    cli()
+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import click
2+
3+
from codeflare_sdk.cluster.cluster import Cluster
4+
from codeflare_sdk.cluster.config import ClusterConfiguration
5+
from codeflare_sdk.cli.cli_utils import PythonLiteralOption
6+
7+
8+
# Command group; the commands below attach themselves via ``@cli.command()``.
@click.group()
def cli():
    """Define a resource with parameter specifications"""
12+
13+
14+
@cli.command()
@click.option("--name", type=str, required=True)
@click.option("--namespace", "-n", type=str)
@click.option("--head_info", cls=PythonLiteralOption, type=list)
@click.option("--machine_types", cls=PythonLiteralOption, type=list)
@click.option("--min_cpus", type=int)
@click.option("--max_cpus", type=int)
@click.option("--min_worker", type=int)
@click.option("--max_worker", type=int)
@click.option("--min_memory", type=int)
@click.option("--max_memory", type=int)
@click.option("--gpu", type=int)
@click.option("--template", type=str)
@click.option("--instascale", type=bool)
@click.option("--envs", cls=PythonLiteralOption, type=dict)
@click.option("--image", type=str)
@click.option("--local_interactive", type=bool)
@click.option("--image_pull_secrets", cls=PythonLiteralOption, type=list)
def raycluster(**kwargs):
    """Define a RayCluster with parameter specifications"""
    # Drop every option whose value is None before building the configuration.
    supplied = {}
    for option, value in kwargs.items():
        if value is None:
            continue
        supplied[option] = value

    config = ClusterConfiguration(**supplied)
    Cluster(config)  # Creates yaml file

tests/cli-test-case.yaml

+195
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
apiVersion: mcad.ibm.com/v1beta1
2+
kind: AppWrapper
3+
metadata:
4+
labels:
5+
orderedinstance: cpu.small_gpu.large
6+
name: cli-test-cluster
7+
namespace: ns
8+
spec:
9+
priority: 9
10+
resources:
11+
GenericItems:
12+
- custompodresources:
13+
- limits:
14+
cpu: 2
15+
memory: 8G
16+
nvidia.com/gpu: 0
17+
replicas: 1
18+
requests:
19+
cpu: 2
20+
memory: 8G
21+
nvidia.com/gpu: 0
22+
- limits:
23+
cpu: 4
24+
memory: 6G
25+
nvidia.com/gpu: 7
26+
replicas: 2
27+
requests:
28+
cpu: 3
29+
memory: 5G
30+
nvidia.com/gpu: 7
31+
generictemplate:
32+
apiVersion: ray.io/v1alpha1
33+
kind: RayCluster
34+
metadata:
35+
labels:
36+
appwrapper.mcad.ibm.com: cli-test-cluster
37+
controller-tools.k8s.io: '1.0'
38+
name: cli-test-cluster
39+
namespace: ns
40+
spec:
41+
autoscalerOptions:
42+
idleTimeoutSeconds: 60
43+
imagePullPolicy: Always
44+
resources:
45+
limits:
46+
cpu: 500m
47+
memory: 512Mi
48+
requests:
49+
cpu: 500m
50+
memory: 512Mi
51+
upscalingMode: Default
52+
enableInTreeAutoscaling: false
53+
headGroupSpec:
54+
rayStartParams:
55+
block: 'true'
56+
dashboard-host: 0.0.0.0
57+
num-gpus: '0'
58+
serviceType: ClusterIP
59+
template:
60+
spec:
61+
affinity:
62+
nodeAffinity:
63+
requiredDuringSchedulingIgnoredDuringExecution:
64+
nodeSelectorTerms:
65+
- matchExpressions:
66+
- key: cli-test-cluster
67+
operator: In
68+
values:
69+
- cli-test-cluster
70+
containers:
71+
- env:
72+
- name: MY_POD_IP
73+
valueFrom:
74+
fieldRef:
75+
fieldPath: status.podIP
76+
- name: RAY_USE_TLS
77+
value: '0'
78+
- name: RAY_TLS_SERVER_CERT
79+
value: /home/ray/workspace/tls/server.crt
80+
- name: RAY_TLS_SERVER_KEY
81+
value: /home/ray/workspace/tls/server.key
82+
- name: RAY_TLS_CA_CERT
83+
value: /home/ray/workspace/tls/ca.crt
84+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
85+
imagePullPolicy: Always
86+
lifecycle:
87+
preStop:
88+
exec:
89+
command:
90+
- /bin/sh
91+
- -c
92+
- ray stop
93+
name: ray-head
94+
ports:
95+
- containerPort: 6379
96+
name: gcs
97+
- containerPort: 8265
98+
name: dashboard
99+
- containerPort: 10001
100+
name: client
101+
resources:
102+
limits:
103+
cpu: 2
104+
memory: 8G
105+
nvidia.com/gpu: 0
106+
requests:
107+
cpu: 2
108+
memory: 8G
109+
nvidia.com/gpu: 0
110+
imagePullSecrets:
111+
- name: cli-test-pull-secret
112+
rayVersion: 2.1.0
113+
workerGroupSpecs:
114+
- groupName: small-group-cli-test-cluster
115+
maxReplicas: 2
116+
minReplicas: 2
117+
rayStartParams:
118+
block: 'true'
119+
num-gpus: '7'
120+
replicas: 2
121+
template:
122+
metadata:
123+
annotations:
124+
key: value
125+
labels:
126+
key: value
127+
spec:
128+
affinity:
129+
nodeAffinity:
130+
requiredDuringSchedulingIgnoredDuringExecution:
131+
nodeSelectorTerms:
132+
- matchExpressions:
133+
- key: cli-test-cluster
134+
operator: In
135+
values:
136+
- cli-test-cluster
137+
containers:
138+
- env:
139+
- name: MY_POD_IP
140+
valueFrom:
141+
fieldRef:
142+
fieldPath: status.podIP
143+
- name: RAY_USE_TLS
144+
value: '0'
145+
- name: RAY_TLS_SERVER_CERT
146+
value: /home/ray/workspace/tls/server.crt
147+
- name: RAY_TLS_SERVER_KEY
148+
value: /home/ray/workspace/tls/server.key
149+
- name: RAY_TLS_CA_CERT
150+
value: /home/ray/workspace/tls/ca.crt
151+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
152+
lifecycle:
153+
preStop:
154+
exec:
155+
command:
156+
- /bin/sh
157+
- -c
158+
- ray stop
159+
name: machine-learning
160+
resources:
161+
limits:
162+
cpu: 4
163+
memory: 6G
164+
nvidia.com/gpu: 7
165+
requests:
166+
cpu: 3
167+
memory: 5G
168+
nvidia.com/gpu: 7
169+
imagePullSecrets:
170+
- name: cli-test-pull-secret
171+
initContainers:
172+
- command:
173+
- sh
174+
- -c
175+
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
176+
do echo waiting for myservice; sleep 2; done
177+
image: busybox:1.28
178+
name: init-myservice
179+
replicas: 1
180+
- generictemplate:
181+
apiVersion: route.openshift.io/v1
182+
kind: Route
183+
metadata:
184+
labels:
185+
odh-ray-cluster-service: cli-test-cluster-head-svc
186+
name: ray-dashboard-cli-test-cluster
187+
namespace: ns
188+
spec:
189+
port:
190+
targetPort: dashboard
191+
to:
192+
kind: Service
193+
name: cli-test-cluster-head-svc
194+
replica: 1
195+
Items: []

tests/unit_test.py

+34
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import filecmp
1818
import os
1919
import re
20+
from click.testing import CliRunner
2021

2122
parent = Path(__file__).resolve().parents[1]
2223
sys.path.append(str(parent) + "/src")
@@ -63,6 +64,7 @@
6364
generate_tls_cert,
6465
export_env,
6566
)
67+
from codeflare_sdk.cli.codeflare_cli import cli
6668

6769
import openshift
6870
from openshift.selector import Selector
@@ -75,6 +77,37 @@
7577
import yaml
7678

7779

80+
# CLI testing
81+
def test_cli_working():
    # Invoking the root command without arguments must exit with status 0.
    outcome = CliRunner().invoke(cli)
    assert outcome.exit_code == 0
85+
86+
87+
def test_cluster_definition_cli():
    """Run ``define raycluster`` via the CLI and compare the YAML it writes
    with the stored ``tests/cli-test-case.yaml`` fixture."""
    runner = CliRunner()
    define_cluster_command = """
        define raycluster
        --name=cli-test-cluster
        --namespace=ns
        --min_worker=1
        --max_worker=2
        --min_cpus=3
        --max_cpus=4
        --min_memory=5
        --max_memory=6
        --gpu=7
        --instascale=True
        --machine_types='["cpu.small", "gpu.large"]'
        --image_pull_secrets='["cli-test-pull-secret"]'
        """
    result = runner.invoke(cli, define_cluster_command)
    # Check the exit status first and show the captured output on failure,
    # so a broken invocation is not reported as a mere output mismatch.
    assert result.exit_code == 0, result.output
    assert result.output == "Written to: cli-test-cluster.yaml\n"
    # shallow=False always compares file contents rather than accepting
    # matching os.stat() signatures as proof of equality.
    assert filecmp.cmp(
        "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=False
    )
109+
110+
78111
# For mocking openshift client results
79112
fake_res = openshift.Result("fake")
80113

@@ -2222,3 +2255,4 @@ def test_cleanup():
22222255
os.remove("tls-cluster-namespace/tls.crt")
22232256
os.remove("tls-cluster-namespace/tls.key")
22242257
os.rmdir("tls-cluster-namespace")
2258+
os.remove("cli-test-cluster.yaml")

0 commit comments

Comments
 (0)