|
| 1 | +#!/usr/bin/python3 |
| 2 | + |
| 3 | +import argparse |
| 4 | +from datetime import datetime, timedelta |
| 5 | +import re |
| 6 | +import subprocess |
| 7 | +import tempfile |
| 8 | +import yaml |
| 9 | +import os |
| 10 | + |
# Command-line interface for submitting RunAI jobs to the cluster.
# All options are optional; sensible defaults are filled in by the main block.
parser = argparse.ArgumentParser(description="Cluster Submit Utility")
parser.add_argument(
    "-n",
    "--name",
    type=str,
    required=False,
    help="Job name (has to be unique in the namespace)",
)
parser.add_argument(
    "-c",
    "--command",
    type=str,
    required=False,
    help="Command to run on the instance (default sleep for duration)",
)
parser.add_argument(
    "-t",
    "--time",
    type=str,
    required=False,
    # NOTE: help used to say "default 24h", but the actual default applied in
    # the main block is 7 * 24 * 60 * 60 seconds, i.e. 7 days.
    help="The maximum duration allowed for this job (default 7d)",
)
parser.add_argument(
    "-g",
    "--gpus",
    type=int,
    default=1,
    required=False,
    help="The number of GPUs requested (default 1)",
)
parser.add_argument(
    "--cpus",
    type=int,
    default=4,
    required=False,
    help="The number of CPUs requested (default 4)",
)
parser.add_argument(
    "-i",
    "--image",
    type=str,
    required=False,
    default="ic-registry.epfl.ch/mlo/mlo:v1",
    help="The URL of the docker image that will be used for the job",
)
parser.add_argument(
    "-p",
    "--port",
    type=int,
    required=False,
    help="A cluster port to connect to this node",
)
parser.add_argument(
    "-u",
    "--user",
    type=str,
    default="user.yaml",
    help="Path to a yaml file that defines the user",
)
parser.add_argument(
    "--train",
    action="store_true",
    help="train job (default is interactive, which has higher priority)",
)
parser.add_argument(
    "-d",
    "--dry",
    action="store_true",
    help="Print the generated yaml file instead of submitting it",
)
| 81 | + |
if __name__ == "__main__":
    args = parser.parse_args()

    # The user file supplies cluster identity: user, uid/gid, wandb key, symlinks.
    if not os.path.exists(args.user):
        print(
            f"User file {args.user} does not exist, use the template in `template/user.yaml` to create your user file."
        )
        exit(1)

    with open(args.user, "r") as file:
        user_cfg = yaml.safe_load(file)

    # Default job name: <user>-<timestamp>, unique enough within the namespace.
    if args.name is None:
        args.name = f"{user_cfg['user']}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

    # Convert a human-readable duration ("2d4h30m15s") into seconds.
    if args.time is None:
        args.time = 7 * 24 * 60 * 60  # default: 7 days
    else:
        pattern = r"((?P<days>\d+)d)?((?P<hours>\d+)h)?((?P<minutes>\d+)m)?((?P<seconds>\d+)s?)?"
        # fullmatch (rather than match) so that unparsable input is rejected
        # instead of silently yielding a 0-second duration (and `sleep 0`).
        match = re.fullmatch(pattern, args.time)
        parts = (
            {k: int(v) for k, v in match.groupdict().items() if v} if match else {}
        )
        if not parts:
            print(f"Could not parse --time value '{args.time}' (expected e.g. '2d4h30m').")
            exit(1)
        args.time = int(timedelta(**parts).total_seconds())

    # Default command just keeps the pod alive for the allotted duration.
    if args.command is None:
        args.command = f"sleep {args.time}"

    # Train jobs must NOT carry the "build" priority class label; interactive
    # jobs keep it (higher scheduling priority). We comment the line out in the
    # rendered YAML rather than removing it, to keep the template readable.
    comment_out_priority = "#" if args.train else ""

    # Build the colon-separated env strings consumed by the image entrypoint to
    # create symlinks inside the container's home directory. Each value in
    # user_cfg["symlinks"] is a (type, relative_path) pair keyed by the target
    # path under working_dir.
    working_dir = user_cfg["working_dir"]
    symlink_targets, symlink_destinations = zip(*user_cfg["symlinks"].items())
    symlink_targets = ":".join(
        [os.path.join(working_dir, target) for target in symlink_targets]
    )
    symlink_paths = ":".join(
        [
            os.path.join(f"/home/{user_cfg['user']}", dest[1])
            for dest in symlink_destinations
        ]
    )
    symlink_types = ":".join([dest[0] for dest in symlink_destinations])

    # NOTE(review): --port is parsed but never referenced in the generated
    # spec below — confirm whether a port mapping was meant to be templated in.
    # NOTE(review): `stdin:` and `tty:` below carry no value (YAML null) —
    # presumably `stdin: true` / `tty: true` was intended; confirm.
    cfg = f"""
# Source: runaijob/templates/runai-job.yaml
apiVersion: run.ai/v1
kind: RunaiJob
metadata:
  name: {args.name}
  labels:
    {comment_out_priority}priorityClassName: "build" # Interactive Job if present, for Train Job REMOVE this line
    user: {user_cfg['user']}
spec:
  template:
    metadata:
      labels:
        user: {user_cfg['user']}
    spec:
      hostIPC: true
      schedulerName: runai-scheduler
      restartPolicy: Never
      securityContext:
        runAsUser: {user_cfg['uid']}
        runAsGroup: {user_cfg['gid']}
      containers:
        - name: {args.name}
          image: {args.image}
          imagePullPolicy: Always
          workingDir: "/home/{user_cfg['user']}"
          securityContext:
            allowPrivilegeEscalation: true
          stdin:
          tty:
          args: [
            "/bin/bash",
            "-c",
            "{args.command}",
          ]
          env:
            - name: HOME
              value: "/home/{user_cfg['user']}"
            - name: NB_USER
              value: {user_cfg['user']}
            - name: NB_UID
              value: "{user_cfg['uid']}"
            - name: NB_GROUP
              value: {user_cfg['group']}
            - name: NB_GID
              value: "{user_cfg['gid']}"
            - name: WORKING_DIR
              value: "{working_dir}"
            - name: SYMLINK_TARGETS
              value: "{symlink_targets}"
            - name: SYMLINK_PATHS
              value: "{symlink_paths}"
            - name: SYMLINK_TYPES
              value: "{symlink_types}"
            - name: WANDB_API_KEY
              value: {user_cfg['wandb_api_key']}
            - name: EPFML_LDAP
              value: {user_cfg['user']}
          resources:
            limits:
              nvidia.com/gpu: {args.gpus}
            requests:
              cpu: {args.cpus}
          volumeMounts:
            - mountPath: /mloscratch
              name: mloscratch
            - mountPath: /dev/shm # Increase shared memory size
              name: dshm
          ports:
            - protocol: 'TCP'
              containerPort: 22
      volumes:
        - name: mloscratch
          persistentVolumeClaim:
            claimName: runai-mlo-{user_cfg['user']}-scratch
        - name: dshm # Increase the shared memory size
          emptyDir:
            medium: Memory
      # nodeSelector:
      #   run.ai/type: G10
"""

    # Write the manifest to a temp file so kubectl can consume it; the file is
    # removed automatically when the context manager exits.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as f:
        f.write(cfg)
        f.flush()
        if args.dry:
            print(cfg)
        else:
            # text=True so stdout/stderr are printed as strings, not as
            # b'...' byte reprs (the previous code captured raw bytes).
            result = subprocess.run(
                ["kubectl", "create", "-f", f.name],
                capture_output=True,
                text=True,
            )
            print(result.stdout)
            print(result.stderr)

    print("\nThe following commands may come in handy:")
    print(f"runai bash {args.name} - opens an interactive shell on the pod")
    print(
        f"runai delete job {args.name} - kills the job and removes it from the list of jobs"
    )
    print(
        f"runai describe job {args.name} - shows information on the status/execution of the job"
    )
    print("runai list jobs - list all jobs and their status")
    print(f"runai logs {args.name} - shows the output/logs for the job")
0 commit comments