Skip to content

Commit 14e7302

Browse files
authored
[Slurm] Add retry_delays when waiting for workers (#176)
* [Slurm] Add retry_delays when waiting for workers * [Slurm] add default constructor for SlurmManager
1 parent b520580 commit 14e7302

File tree

1 file changed

+31
-3
lines changed

1 file changed

+31
-3
lines changed

src/slurm.jl

+31-3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ import Logging.@warn
66

77
"""
    SlurmManager(np, retry_delays)

Cluster manager used to start Julia workers through Slurm's `srun`.

# Fields
- `np::Integer`: number of workers to launch.
- `retry_delays`: iterable of delays in seconds. `launch` iterates over it while
  waiting for each worker's job output file and sleeps for every delay it yields,
  so the length of the iterable bounds how long a worker is waited for.
"""
struct SlurmManager <: ClusterManager
    np::Integer
    # Left untyped on purpose: any iterable works (a vector, `ExponentialBackOff`,
    # `Iterators.repeated(x)`, ...).
    retry_delays
end
11+
12+
"""
    SlurmException(msg)

Exception thrown when a Slurm worker cannot be reached, e.g. when the retry
delays are exhausted before the worker's connection details could be read.
`msg` holds the human-readable description of the failure.
"""
struct SlurmException <: Exception
    msg
end

# Print the message instead of the default struct dump so that the failure
# reads naturally in stack traces and logs.
Base.showerror(io::IO, e::SlurmException) = print(io, "SlurmException: ", e.msg)
1015

1116
function launch(manager::SlurmManager, params::Dict, instances_arr::Array,
@@ -54,12 +59,13 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array,
5459
srun_cmd = `srun -J $jobname -n $np -o "$(job_output_template)" -D $exehome $(srunargs) $exename $exeflags $(worker_arg())`
5560
srun_proc = open(srun_cmd)
5661
slurm_spec_regex = r"([\w]+):([\d]+)#(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})"
62+
retry_delays = manager.retry_delays
5763
for i = 0:np - 1
5864
println("connecting to worker $(i + 1) out of $np")
5965
slurm_spec_match = nothing
6066
fn = make_job_output_path(lpad(i, 4, "0"))
6167
t0 = time()
62-
while true
68+
for retry_delay in retry_delays
6369
# Wait for output log to be created and populated, then parse
6470
if isfile(fn) && filesize(fn) > 0
6571
slurm_spec_match = open(fn) do f
@@ -77,8 +83,13 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array,
7783
end
7884
end
7985
# Sleep for some time to limit resource usage while waiting for the job to start
80-
sleep(0.1)
86+
sleep(retry_delay)
87+
end
88+
89+
if slurm_spec_match === nothing
90+
throw(SlurmException("Timeout while trying to connect to worker"))
8191
end
92+
8293
config = WorkerConfig()
8394
config.port = parse(Int, slurm_spec_match[2])
8495
config.host = strip(slurm_spec_match[3])
@@ -99,4 +110,21 @@ function manage(manager::SlurmManager, id::Integer, config::WorkerConfig,
99110
# This function needs to exist, but so far we don't do anything
100111
end
101112

102-
addprocs_slurm(np::Integer; kwargs...) = addprocs(SlurmManager(np); kwargs...)
113+
"""
    SlurmManager(np::Integer)

Construct a `SlurmManager` for `np` workers with the default retry schedule:
an `ExponentialBackOff` of 10 delays that starts at 1 second, grows by a
factor of 2 and is capped at 512 seconds.
"""
SlurmManager(np::Integer) = SlurmManager(
    np, ExponentialBackOff(n=10, first_delay=1, max_delay=512, factor=2)
)
115+
116+
"""
117+
Launch `np` workers on a cluster managed by slurm. `retry_delays` is a vector of
118+
numbers specifying in seconds how long to repeatedly wait for a worker to start.
119+
Defaults to an exponential backoff.
120+
121+
# Examples
122+
123+
```
124+
addprocs_slurm(100; retry_delays=Iterators.repeated(0.1))
125+
```
126+
"""
127+
addprocs_slurm(np::Integer;
128+
retry_delays=ExponentialBackOff(n=10, first_delay=1,
129+
max_delay=512, factor=2),
130+
kwargs...) = addprocs(SlurmManager(np, retry_delays); kwargs...)

0 commit comments

Comments
 (0)