Skip to content

Commit 766e9af

Browse files
committed
init commit
1 parent 363748d commit 766e9af

9 files changed

+763
-0
lines changed

Diff for: Dockerfile

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
FROM --platform=amd64 nvidia/cuda:11.4.3-cudnn8-devel-ubuntu20.04

# key=value form; the legacy space-separated `LABEL maintainer "..."` is deprecated.
LABEL maintainer="Alexander Hägele <[email protected]>"

# Suppress interactive prompts from apt during the build.
# NOTE(review): deliberately left as ENV (not ARG) since this image is used as
# an interactive dev pod where users run apt themselves — confirm intent.
ENV DEBIAN_FRONTEND=noninteractive

# Install some necessary tools (sorted alphabetically for diffability);
# apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    bzip2 \
    ca-certificates \
    cmake \
    curl \
    git \
    htop \
    keychain \
    libffi-dev \
    libssl-dev \
    locales \
    openssh-client \
    openssh-server \
    python3 \
    python3-pip \
    rsync \
    screen \
    sudo \
    tmux \
    unzip \
    vim \
    wget \
    zsh \
    && rm -rf /var/lib/apt/lists/*

# UTF-8 locale so interactive shells and Python behave consistently.
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8

# Install oh-my-zsh (uses the default "robbyrussell" theme) and make zsh
# the login shell for root.
RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended
RUN chsh -s /bin/zsh root

# Setup env for the CUDA toolchain.
# Fix: the original wrote "usr/local/cuda/bin" (missing the leading slash),
# a relative path that never resolves — and it was then propagated into
# /etc/environment below.
ENV PATH="/usr/local/cuda/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/cuda/lib64"

# Make $PATH and $LD_LIBRARY_PATH available to all users (e.g. ssh sessions).
RUN echo PATH="${PATH}" >> /etc/environment && \
    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

# Seems like you need this to run Tensorflow and Jax together.
RUN echo TF_FORCE_GPU_ALLOW_GROWTH='true' >> /etc/environment

# Set a password for the root user. (`sudo` dropped: docker builds already
# run as root, and hadolint flags sudo inside RUN.)
# NOTE(review): a fixed root password is only acceptable on a trusted
# cluster network — confirm.
RUN echo 'root:root' | chpasswd

# ===== Copy init files to /docker/ folder =====
RUN mkdir /docker
COPY utils/* /docker/
RUN chmod +x /docker/*.sh

# Copy oh-my-zsh config over to /docker/ so that it can be copied to
# scratch space inside entrypoint.sh. One RUN keeps this to a single layer.
RUN cp -r /root/.oh-my-zsh/ /docker/.oh-my-zsh/ && \
    cp -r /root/.zshrc /docker/.zshrc && \
    cp -r /root/.bashrc /docker/.bashrc && \
    cp -r /root/.profile /docker/.profile

# entrypoint.sh receives CMD as its arguments, so the default behavior is an
# interactive zsh once the entrypoint's setup finishes.
ENTRYPOINT ["/docker/entrypoint.sh"]
CMD ["/bin/zsh"]

Diff for: README.md

+260
Large diffs are not rendered by default.

Diff for: csub.py

+232
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
#!/usr/bin/python3
2+
3+
import argparse
4+
from datetime import datetime, timedelta
5+
import re
6+
import subprocess
7+
import tempfile
8+
import yaml
9+
import os
10+
11+
# Command-line interface. Flag names, defaults, and semantics are the public
# interface of this script — keep them backward-compatible.
parser = argparse.ArgumentParser(description="Cluster Submit Utility")
parser.add_argument(
    "-n",
    "--name",
    type=str,
    required=False,
    help="Job name (has to be unique in the namespace)",
)
parser.add_argument(
    "-c",
    "--command",
    type=str,
    required=False,
    help="Command to run on the instance (default sleep for duration)",
)
parser.add_argument(
    "-t",
    "--time",
    type=str,
    required=False,
    # Fix: the code defaults to 7 days (7*24*60*60 s), not 24h as the old
    # help text claimed.
    help="The maximum duration allowed for this job, e.g. '1d12h30m' (default 7 days)",
)
parser.add_argument(
    "-g",
    "--gpus",
    type=int,
    default=1,
    required=False,
    help="The number of GPUs requested (default 1)",
)
parser.add_argument(
    "--cpus",
    type=int,
    default=4,
    required=False,
    help="The number of CPUs requested (default 4)",
)
parser.add_argument(
    "-i",
    "--image",
    type=str,
    required=False,
    default="ic-registry.epfl.ch/mlo/mlo:v1",
    help="The URL of the docker image that will be used for the job",
)
parser.add_argument(
    "-p",
    "--port",
    type=int,
    required=False,
    help="A cluster port for connecting to this node",
)
parser.add_argument(
    "-u",
    "--user",
    type=str,
    default="user.yaml",
    help="Path to a yaml file that defines the user",
)
parser.add_argument(
    "--train",
    action="store_true",
    help="train job (default is interactive, which has higher priority)",
)
parser.add_argument(
    "-d",
    "--dry",
    action="store_true",
    help="Print the generated yaml file instead of submitting it",
)
81+
82+
if __name__ == "__main__":
    args = parser.parse_args()

    if not os.path.exists(args.user):
        print(
            f"User file {args.user} does not exist, use the template in `template/user.yaml` to create your user file."
        )
        exit(1)

    with open(args.user, "r") as file:
        user_cfg = yaml.safe_load(file)

    # Default job name: <user>-<timestamp>, unique per submission.
    if args.name is None:
        args.name = f"{user_cfg['user']}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

    # Normalize --time into an integer number of seconds.
    if args.time is None:
        args.time = 7 * 24 * 60 * 60  # default: one week
    else:
        pattern = r"((?P<days>\d+)d)?((?P<hours>\d+)h)?((?P<minutes>\d+)m)?((?P<seconds>\d+)s?)?"
        match = re.match(pattern, args.time)
        parts = {k: int(v) for k, v in match.groupdict().items() if v}
        # Fix: every group in the pattern is optional, so a malformed value
        # (e.g. "tomorrow") used to parse silently to 0 seconds. Reject it.
        if not parts:
            print(f"Could not parse --time value {args.time!r}, expected e.g. '1d12h30m'.")
            exit(1)
        args.time = int(timedelta(**parts).total_seconds())

    if args.command is None:
        args.command = f"sleep {args.time}"

    # Interactive jobs carry the "build" priority label; for train jobs the
    # line is commented out of the generated YAML.
    if args.train:
        comment_out_priority = "#"  # comment
    else:
        comment_out_priority = ""

    working_dir = user_cfg["working_dir"]
    # user.yaml maps symlink target -> [type, destination]; flatten into three
    # colon-separated strings passed to the container via env vars.
    symlink_targets, symlink_destinations = zip(*user_cfg["symlinks"].items())
    symlink_targets = ":".join(
        [os.path.join(working_dir, target) for target in symlink_targets]
    )
    symlink_paths = ":".join(
        [
            os.path.join(f"/home/{user_cfg['user']}", dest[1])
            for dest in symlink_destinations
        ]
    )
    symlink_types = ":".join([dest[0] for dest in symlink_destinations])

    # NOTE(review): the wandb API key is embedded in plain text in the job
    # spec — consider a Kubernetes secret instead.
    # NOTE(review): `stdin:`/`tty:` below carry no value (parse as null) —
    # presumably `true` was intended; confirm against the Run:ai spec.
    cfg = f"""
# Source: runaijob/templates/runai-job.yaml
apiVersion: run.ai/v1
kind: RunaiJob
metadata:
  name: {args.name}
  labels:
    {comment_out_priority}priorityClassName: "build" # Interactive Job if present, for Train Job REMOVE this line
    user: {user_cfg['user']}
spec:
  template:
    metadata:
      labels:
        user: {user_cfg['user']}
    spec:
      hostIPC: true
      schedulerName: runai-scheduler
      restartPolicy: Never
      securityContext:
        runAsUser: {user_cfg['uid']}
        runAsGroup: {user_cfg['gid']}
      containers:
        - name: {args.name}
          image: {args.image}
          imagePullPolicy: Always
          workingDir: "/home/{user_cfg['user']}"
          securityContext:
            allowPrivilegeEscalation: true
          stdin:
          tty:
          args: [
            "/bin/bash",
            "-c",
            "{args.command}",
          ]
          env:
            - name: HOME
              value: "/home/{user_cfg['user']}"
            - name: NB_USER
              value: {user_cfg['user']}
            - name: NB_UID
              value: "{user_cfg['uid']}"
            - name: NB_GROUP
              value: {user_cfg['group']}
            - name: NB_GID
              value: "{user_cfg['gid']}"
            - name: WORKING_DIR
              value: "{working_dir}"
            - name: SYMLINK_TARGETS
              value: "{symlink_targets}"
            - name: SYMLINK_PATHS
              value: "{symlink_paths}"
            - name: SYMLINK_TYPES
              value: "{symlink_types}"
            - name: WANDB_API_KEY
              value: {user_cfg['wandb_api_key']}
            - name: EPFML_LDAP
              value: {user_cfg['user']}
          resources:
            limits:
              nvidia.com/gpu: {args.gpus}
            requests:
              cpu: {args.cpus}
          volumeMounts:
            - mountPath: /mloscratch
              name: mloscratch
            - mountPath: /dev/shm # Increase shared memory size
              name: dshm
          ports:
            - protocol: 'TCP'
              containerPort: 22
      volumes:
        - name: mloscratch
          persistentVolumeClaim:
            claimName: runai-mlo-{user_cfg['user']}-scratch
        - name: dshm # Increase the shared memory size
          emptyDir:
            medium: Memory
      # nodeSelector:
      #   run.ai/type: G10
"""

    # Write the spec to a temp file and hand it to kubectl (unless --dry).
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as f:
        f.write(cfg)
        f.flush()
        if args.dry:
            print(cfg)
        else:
            # Fix: text=True so stdout/stderr come back as str — the original
            # printed raw bytes literals like b'...'.
            result = subprocess.run(
                ["kubectl", "create", "-f", f.name],
                capture_output=True,
                text=True,
            )
            print(result.stdout)
            print(result.stderr)

    print("\nThe following commands may come in handy:")
    print(f"runai bash {args.name} - opens an interactive shell on the pod")
    print(
        f"runai delete job {args.name} - kills the job and removes it from the list of jobs"
    )
    print(
        f"runai describe job {args.name} - shows information on the status/execution of the job"
    )
    print("runai list jobs - list all jobs and their status")
    print(f"runai logs {args.name} - shows the output/logs for the job")

Diff for: kubeconfig.yaml

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
apiVersion: v1
2+
clusters:
3+
- cluster:
4+
certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURCVENDQWUyZ0F3SUJBZ0lJRUQxZmRDUWZIdGt3RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TXpBNU1ESXhNakl6TURoYUZ3MHpNekE0TXpBeE1qSXpNRGhhTUJVeApFekFSQmdOVkJBTVRDbXQxWW1WeWJtVjBaWE13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLCkFvSUJBUUQxSHBDVU90K0FtUXo0UUR2Z2JBcW9SNXJEVzBhMkdFTDQzT0hFZk9oSVg3NUxlcVoxbEdRclVWNHcKeGNYcno0dzdGbUt2UEp3c3F2VjVLSjY3a1pkbnRUa3dybVM4K0hBNmVRejJYMGJ4WURuL0lKbmFkUnBtcVkxawo4S2t4d3RSLzNKbmR4a29yL0NhdnJHQzR4a0Z2TUxLT0pkUHZtV0MxdG9NUEszOU5kRTF4OVo3K1lycVBSYjRmCnp6K2ErMmFQY2kyNGhKcmsySm8xV2NVN2Y3Z29mRGNKY0lwNGJUTUVGUHMxaS95WkQycDY4RVlTZzhhUjZvVzIKT011WE1mREtkMk9PRWVxdTR0MnhScUl3SlhuTFJNdkJKRkw5NmMwSDcyVGEwRXdnOHVudmhrZmFZR2NzYTV1TwpVTzRWZFdmYUtCVmVsYytpcC91MjVxZXNvTnp6QWdNQkFBR2pXVEJYTUE0R0ExVWREd0VCL3dRRUF3SUNwREFQCkJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJUNmVqdEs5T01zR0pXclIvU1I1QnlOVHZOejhEQVYKQmdOVkhSRUVEakFNZ2dwcmRXSmxjbTVsZEdWek1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRQkoxa044bCtuMwpnV1V0RXlMb2ZJWjVDZG1kOGJXNXpZMkRTeWZpYzhNcXlOQUhPd3N5anp1T21iNndvZlErS1pGMWR1SnpUVnF1CmxXSFBQdE1hc2l5U3JSbEJRSEtXV2IzdVJNeG8zeG1SWExvRW1kSmQ1S1F0SDYvUnNSS0pId29KVVNOWmVITFYKS1R4QTcwWEtRakVUYkROenQ4c0JUUkxVM1lMV0JnTE96RVRzSm5DeVV0ZGZsVm82U3Qwemc3NmJGV3pxMXhqMQora3o0Mzhhd2paV1Z2THNBU0dRNHFkT0hvM1NscUgrUnlJb3U5bGFvOVdKOTEzYi90QWxyQ3lOdzVVTURZWHFWCllCQk9PLzdkeS9BRDdxNDFyTk9rUFFHclQ4MkswUjg3ZWgzeXJXSG9FcSt0N2pvdktrdlFvL2IyT3VhZTV3YnEKZmtRczZpbG92bHptCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
5+
server: https://icadmin011.iccluster.epfl.ch:6443
6+
name: ic-cluster
7+
- cluster:
8+
certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUMvakNDQWVhZ0F3SUJBZ0lCQURBTkJna3Foa2lHOXcwQkFRc0ZBREFWTVJNd0VRWURWUVFERXdwcmRXSmwKY201bGRHVnpNQjRYRFRJek1EUXlOakE0TVRRME5sb1hEVE16TURReU16QTRNVFEwTmxvd0ZURVRNQkVHQTFVRQpBeE1LYTNWaVpYSnVaWFJsY3pDQ0FTSXdEUVlKS29aSWh2Y05BUUVCQlFBRGdnRVBBRENDQVFvQ2dnRUJBTFAxCmtTZ2E4NWRWU0p0VUxGQ1g5VWo1K1lTT2dCbG9MZGVxZVgrM1ByVGtQZkptWFBxeXlsVVBLN0tJUWlvSUplNm8KRTBaS2JZbU03SnEvL0lPaHF4R0VraUNrTHJCamJrYXF5M3NibkNhWGFMa1pQYkhNWjgwdmlMMGNFZHNJTWN4WgozdHpMTzFNTldwZW9mZlJ6L1NvbXpqSTVDQldJbUptTmhvZXpJQUVNOGJuaDJKeFBFNzRwWThTS1BTRk5YVzN0CjgxNmM5cXRvc1lJQjVrTnh1UjRGWVh5bGloZHZ3UmVqVW9wajA2ME1rSkl3QmpXM01YTFUrdkVyandKeFc5Q1cKZ2plUndzOG5kdW5VVHREcy9CVjhGbW5JZy81VVNhZTBzUE5FQWxvZC9TbGhrMnNuWTJvUXZlTHpFNkhrMnluRgpHNXd1VGVXRDZGY2Erd1pNMjM4Q0F3RUFBYU5aTUZjd0RnWURWUjBQQVFIL0JBUURBZ0trTUE4R0ExVWRFd0VCCi93UUZNQU1CQWY4d0hRWURWUjBPQkJZRUZNVVhkVWVnK2xMdTlHWElMQ2VlOVJzOENmUXpNQlVHQTFVZEVRUU8KTUF5Q0NtdDFZbVZ5Ym1WMFpYTXdEUVlKS29aSWh2Y05BUUVMQlFBRGdnRUJBR051a2ZUR3E0RTlrckkreVZQbApaem1reSszaUNTMnYvTU9OU3h0S01idWZ2V0ROZFM3QzZaK1RDQTJSd0c1Y2gzZUh5UW9oTSs0K2wrSTJxMTFwCjNJVGRxYVI4RDhpQkFCbXV6Yzl2a3BKanZTTzZ4VVpnTFJZMHRDTUxXZ3g2b2tBcWhxZDV3YTZIYmN6Z1QrSUcKQlVGbERtR0R4K0MxTnFIYVFKUVN1bENqL1ZyS1RROVFlY1NoZGZqVDgvS1NVUjQ4VTlEdlA3dnU0YkRnWW5DKwpoOXEwUlFpUGR4TEtlL2Q5aGd0UnM5TjFQdGRYZXAxdHB3NCs3Y3N4TE1DSXNmYTBwaW8yb3lEems0bTNjSWRNCi9iNElHUEZaM2hYZktOVGtybnUrWmdCUms5Yjk3emNKZVdhendxTXUyd1dkV2JiQjdpaU5ZK2xtWkl1S0dUeFQKWWpRPQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
9+
server: https://caas-test.rcp.epfl.ch:443
10+
name: rcp-cluster
11+
contexts:
12+
- context:
13+
cluster: ic-cluster
14+
user: ic-runai-authenticated-user
15+
name: ic-context
16+
- context:
17+
cluster: rcp-cluster
18+
user: rcp-runai-authenticated-user
19+
name: rcp-context
20+
current-context: ic-context
21+
kind: Config
22+
preferences: {}
23+
users:
24+
- name: ic-runai-authenticated-user
25+
user:
26+
auth-provider:
27+
config:
28+
airgapped: "true"
29+
auth-flow: remote-browser
30+
client-id: runai-cli-sso
31+
idp-issuer-url: https://app.run.ai/auth/realms/EPFL
32+
realm: EPFL
33+
redirect-uri: https://epfl.run.ai/oauth-code
34+
name: oidc
35+
- name: rcp-runai-authenticated-user
36+
user:
37+
auth-provider:
38+
config:
39+
airgapped: "true"
40+
auth-flow: remote-browser
41+
client-id: runai-cli
42+
idp-issuer-url: https://app.run.ai/auth/realms/rcpepfl
43+
realm: rcpepfl
44+
redirect-uri: https://rcpepfl.run.ai/oauth-code
45+
name: oidc

Diff for: publish.sh

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
# Build the Docker image and push it to the container registry.
# Fail fast on errors, unset variables, and pipe failures (the original
# relied only on a `&&` chain and used unquoted expansions).
set -euo pipefail

BASE=mlo
USER=mlo # Change this to your GASPAR
TAG=v1   # Change this to the version you want to publish

REGISTRY=ic-registry.epfl.ch
# REGISTRY=docker.io

docker build . -t "${BASE}/${USER}:${TAG}" --network=host
docker tag "${BASE}/${USER}:${TAG}" "${REGISTRY}/${BASE}/${USER}:${TAG}"
docker push "${REGISTRY}/${BASE}/${USER}:${TAG}"

Diff for: template/user_template.yaml

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#DO NOT CHANGE THIS FILE, CREATE A COPY IN THE ROOT FOLDER OF THE REPO
2+
user: <enter username>
3+
uid: <enter uid>
4+
group: MLO-unit
5+
gid: <enter gid>
6+
wandb_api_key: <enter api key>
7+
working_dir: /mloscratch/homes/<enter username>
8+
symlinks: # list of symlinks to create (enter 'dir' or 'file' and the path)
9+
.ssh: [dir, .ssh]
10+
.bashrc: [file, .bashrc]
11+
.profile: [file, .profile]
12+
.zshrc: [file, .zshrc]
13+
.oh-my-zsh: [dir, .oh-my-zsh] # installed inside docker + entrypoint on first run
14+
.zsh_history: [file, .zsh_history]
15+
.gitconfig: [file, .gitconfig]
16+
.vscode: [dir, .vscode]
17+
.vscode-server: [dir, .vscode-server]
18+
conda: [conda, conda]

Diff for: utils/conda.sh

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
sudo apt update &&
2+
~/conda/bin/conda update -n base -c defaults conda -y &&
3+
~/conda/bin/conda install -c nvidia -c defaults -c intel -c conda-forge --file /docker/extra_packages.txt --all -y &&
4+
~/conda/bin/conda init zsh &&
5+
~/conda/bin/conda create -n torch pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y &&
6+
~/conda/bin/conda install -n torch -c nvidia -c defaults -c intel -c conda-forge --file /docker/extra_packages.txt --all -y &&
7+
~/conda/envs/torch/bin/pip install --upgrade tensorflow tensorflow-datasets --no-cache-dir &&
8+
~/conda/bin/conda clean --all -y &&
9+
source ~/conda/bin/activate &&
10+
. ~/.zshrc

0 commit comments

Comments
 (0)