@@ -6,6 +6,11 @@ import Logging.@warn
6
6
7
7
"""
    SlurmManager(np, retry_delays)

Cluster manager that starts Julia workers through Slurm.

# Fields
- `np`: number of worker processes to launch.
- `retry_delays`: iterable of waits, in seconds; `launch` iterates over it and sleeps
  for each value while waiting for a worker's job output file to be populated.
"""
struct SlurmManager <: ClusterManager
    np::Integer
    retry_delays::Any
end
11
+
12
"""
    SlurmException(msg)

Error raised by the Slurm cluster manager. `launch` throws it when the retry delays
are exhausted before a worker's connection details could be read.

# Fields
- `msg`: human-readable description of the failure.
"""
struct SlurmException <: Exception
    msg
end

# Show the message itself rather than the default struct dump, so a timeout reads as
# `SlurmException: <msg>` in error output and stack traces.
Base.showerror(io::IO, e::SlurmException) = print(io, "SlurmException: ", e.msg)
10
15
11
16
function launch (manager:: SlurmManager , params:: Dict , instances_arr:: Array ,
@@ -54,12 +59,13 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array,
54
59
srun_cmd = ` srun -J $jobname -n $np -o "$(job_output_template) " -D $exehome $(srunargs) $exename $exeflags $(worker_arg ()) `
55
60
srun_proc = open (srun_cmd)
56
61
slurm_spec_regex = r" ([\w ]+):([\d ]+)#(\d {1,3}.\d {1,3}.\d {1,3}.\d {1,3})"
62
+ retry_delays = manager. retry_delays
57
63
for i = 0 : np - 1
58
64
println (" connecting to worker $(i + 1 ) out of $np " )
59
65
slurm_spec_match = nothing
60
66
fn = make_job_output_path (lpad (i, 4 , " 0" ))
61
67
t0 = time ()
62
- while true
68
+ for retry_delay in retry_delays
63
69
# Wait for output log to be created and populated, then parse
64
70
if isfile (fn) && filesize (fn) > 0
65
71
slurm_spec_match = open (fn) do f
@@ -77,8 +83,13 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array,
77
83
end
78
84
end
79
85
# Sleep for some time to limit resource usage while waiting for the job to start
80
- sleep (0.1 )
86
+ sleep (retry_delay)
87
+ end
88
+
89
+ if slurm_spec_match === nothing
90
+ throw (SlurmException (" Timeout while trying to connect to worker" ))
81
91
end
92
+
82
93
config = WorkerConfig ()
83
94
config. port = parse (Int, slurm_spec_match[2 ])
84
95
config. host = strip (slurm_spec_match[3 ])
@@ -99,4 +110,21 @@ function manage(manager::SlurmManager, id::Integer, config::WorkerConfig,
99
110
# This function needs to exist, but so far we don't do anything
100
111
end
101
112
102
- addprocs_slurm (np:: Integer ; kwargs... ) = addprocs (SlurmManager (np); kwargs... )
113
# Single source of truth for the default wait schedule: 10 waits starting at 1 second,
# growing by a factor of about 2 each time and capped at 512 seconds.
default_retry_delays() = ExponentialBackOff(n=10, first_delay=1, max_delay=512, factor=2)

# Convenience constructor: a manager for `np` workers using the default wait schedule.
SlurmManager(np::Integer) = SlurmManager(np, default_retry_delays())

"""
    addprocs_slurm(np::Integer; retry_delays=default_retry_delays(), kwargs...)

Launch `np` workers on a cluster managed by Slurm.

# Keywords
- `retry_delays`: any iterable of numbers (a vector, an `ExponentialBackOff`, a lazy
  iterator, ...) giving, in seconds, how long to wait between successive checks for a
  worker to start. Defaults to an exponential backoff of 10 waits from 1 to at most
  512 seconds. If the iterable is exhausted before a worker has reported its
  connection details, a `SlurmException` is thrown; an infinite iterator therefore
  waits indefinitely.
- `kwargs...`: forwarded unchanged to `addprocs`.

# Examples

```
addprocs_slurm(100; retry_delays=Iterators.repeated(0.1))
```
"""
function addprocs_slurm(np::Integer; retry_delays=default_retry_delays(), kwargs...)
    return addprocs(SlurmManager(np, retry_delays); kwargs...)
end
0 commit comments