Skip to content

Commit 6d2b850

Browse files
committed
Add the "auto-detect the current allocation" feature
1 parent c50cff7 commit 6d2b850

File tree

4 files changed

+154
-0
lines changed

4 files changed

+154
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Manifest.toml

Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,18 @@ version = "1.0.0"
44

55
[deps]
66
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
7+
LSFClusterManager = "af02cf76-cbe3-4eeb-96a8-af9391005858"
78
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
89
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
10+
SlurmClusterManager = "c82cd089-7bf7-41d7-976b-6b5d413cbe0a"
911
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
1012

1113
[compat]
1214
Distributed = "< 0.0.1, 1"
15+
LSFClusterManager = "1.0.0"
1316
Logging = "< 0.0.1, 1"
1417
Pkg = "< 0.0.1, 1"
18+
SlurmClusterManager = "0.1.3"
1519
Sockets = "< 0.0.1, 1"
1620
julia = "1.2"
1721

src/ClusterManagers.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,24 @@ using Distributed
44
using Sockets
55
using Pkg
66

7+
import LSFClusterManager
8+
import SlurmClusterManager
9+
710
export launch, manage, kill, init_worker, connect
811
import Distributed: launch, manage, kill, init_worker, connect
912

13+
# Bring some other names into scope, just for convenience:
14+
using Distributed: addprocs
15+
1016
"""
    worker_cookie()

Call `Distributed.init_multi()` and then return the value of `cluster_cookie()`.
"""
function worker_cookie()
    Distributed.init_multi()
    return cluster_cookie()
end
1117
"""
    worker_arg()

Return the command fragment `--worker=<cookie>`, where `<cookie>` is the value
returned by `worker_cookie()`.
"""
function worker_arg()
    cookie = worker_cookie()
    return `--worker=$(cookie)`
end
1218

1319

1420
# PBS doesn't have the same semantics as SGE with respect to file accumulation,
1521
# a different solution will have to be found
1622
include("qsub.jl")
23+
24+
include("auto_detect.jl")
1725
include("scyld.jl")
1826
include("condor.jl")
1927
include("slurm.jl")

src/auto_detect.jl

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""
    addprocs_autodetect_current_scheduler(; kwargs...)

Detect which cluster scheduler the current process is running under (via
`autodetect_current_scheduler()`) and add worker processes using the matching
backend. All keyword arguments are forwarded unchanged to that backend.

# Returns
Whatever the selected backend's `addprocs`-style function returns.

# Throws
An `ErrorException` if no supported scheduler is detected. Errors raised while
reading the task count from the environment propagate to the caller.
"""
function addprocs_autodetect_current_scheduler(; kwargs...)
    sched = autodetect_current_scheduler()

    if sched == :slurm
        # SlurmManager takes no task count here; only kwargs are forwarded.
        res = Distributed.addprocs(SlurmClusterManager.SlurmManager(); kwargs...)

    elseif sched == :lsf
        np = _lsf_get_numtasks()
        res = LSFClusterManager.addprocs_lsf(np; kwargs...)

    elseif sched == :sge
        # The helper is named `_sge_get_numtasks`; the previous call site used
        # a name that is not defined anywhere in this file.
        np = _sge_get_numtasks()
        res = addprocs_sge(np; kwargs...)

    elseif sched == :pbs
        np = _torque_get_numtasks()
        res = addprocs_pbs(np; kwargs...)

    else
        # `sched` is `nothing` here when none of the probes matched.
        error("Unable to auto-detect cluster scheduler: $(sched)")
    end

    return res
end
25+
26+
"""
    autodetect_current_scheduler()

Probe the environment for a known scheduler and return one of `:slurm`,
`:lsf`, `:sge`, or `:pbs`. The probes run in that order and the first match
wins. Return `nothing` when no probe matches.
"""
function autodetect_current_scheduler()
    _autodetect_is_slurm() && return :slurm
    _autodetect_is_lsf() && return :lsf
    _autodetect_is_sge() && return :sge
    _autodetect_is_pbs() && return :pbs
    return nothing
end
38+
39+
##### Slurm:
40+
41+
"""
    _autodetect_is_slurm()

Return `true` if either `SLURM_JOB_ID` or `SLURM_JOBID` is set to a non-blank
value in the environment, and `false` otherwise.
"""
function _autodetect_is_slurm()
    candidate_names = ("SLURM_JOB_ID", "SLURM_JOBID")
    return any(_has_env_nonempty, candidate_names)
end
47+
48+
##### LSF:
49+
50+
"""
    _autodetect_is_lsf()

Return `true` if `LSB_JOBNAME` is set to a non-blank value in the environment,
and `false` otherwise.
"""
function _autodetect_is_lsf()
    # https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=variables-environment-set-job-execution
    return _has_env_nonempty("LSB_JOBNAME")
end
55+
56+
"""
    _lsf_get_numtasks()

Return the integer value of the `LSB_DJOB_NUMPROC` environment variable.

All validation (missing, empty, blank, or non-integer value) is delegated to
`_getenv_parse_int`, so a missing variable produces that helper's descriptive
error rather than a bare `KeyError` from indexing `ENV` directly.
"""
function _lsf_get_numtasks()
    # https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=variables-environment-variable-reference
    #
    # See also:
    # https://portal.supercomputing.wales/index.php/index/slurm/lsf-to-slurm-ref/
    name = "LSB_DJOB_NUMPROC"
    value_int = _getenv_parse_int(name)
    return value_int
end
66+
67+
##### SGE (Sun Grid Engine):
68+
69+
"""
    _autodetect_is_sge()

Return `true` if `SGE_O_HOST` is set to a non-blank value in the environment,
and `false` otherwise.
"""
function _autodetect_is_sge()
    # https://docs.oracle.com/cd/E19957-01/820-0699/chp4-21/index.html
    #
    # Important note:
    # The "job ID" environment variable in SGE is just named `JOB_ID`.
    # This is obviously too vague, because the variable name is not specific to SGE.
    # Therefore, we can't use that variable for our SGE auto-detection.
    return _has_env_nonempty("SGE_O_HOST")
end
79+
80+
"""
    _sge_get_numtasks()

Return the integer value of the `NHOSTS` environment variable, emitting a
warning that this value is being used in place of a true task count.

# Throws
Propagates any error raised by `_getenv_parse_int` when `NHOSTS` is missing,
empty, blank, or not an integer.
"""
function _sge_get_numtasks()
    # https://docs.oracle.com/cd/E19957-01/820-0699/chp4-21/index.html
    name = "NHOSTS"

    # Read the value first: the warning below interpolates it, and the
    # previous version referenced a variable that was never defined.
    value_int = _getenv_parse_int(name)

    msg = "Because this is Sun Grid Engine (SGE), ClusterManagers.jl is not able " *
        "to correctly auto-detect the number of tasks. " *
        "Therefore, ClusterManagers.jl will instead use the value of the " *
        "NHOSTS environment variable: $(value_int)"
    @warn msg

    return value_int
end
92+
93+
##### PBS and Torque:
94+
95+
"""
    _autodetect_is_pbs()

Return `true` if `PBS_JOBID` is set to a non-blank value in the environment,
and `false` otherwise.
"""
function _autodetect_is_pbs()
    # https://docs.adaptivecomputing.com/torque/2-5-12/help.htm#topics/2-jobs/exportedBatchEnvVar.htm
    return _has_env_nonempty("PBS_JOBID")
end
100+
101+
"""
    _torque_get_numtasks()

Return the integer value of the `PBS_TASKNUM` environment variable and log the
value that was picked up.

# Throws
Propagates any error raised by `_getenv_parse_int` when `PBS_TASKNUM` is
missing, empty, blank, or not an integer.
"""
function _torque_get_numtasks()
    # https://docs.adaptivecomputing.com/torque/2-5-12/help.htm#topics/2-jobs/exportedBatchEnvVar.htm
    # NOTE(review): confirm that PBS_TASKNUM (rather than e.g. PBS_NP) is the
    # intended source for the worker count on the target clusters.
    name = "PBS_TASKNUM"
    value_int = _getenv_parse_int(name)

    # Log before returning: this message used to sit after the `return`
    # (unreachable) and interpolated a variable that was never defined.
    @info "Using auto-detected num_tasks: $(value_int)"

    return value_int
end
109+
110+
##### General utility functions:
111+
112+
"""
    _has_env_nonempty(name::AbstractString)

Return `true` if the environment variable `name` exists and its value still
has at least one character after stripping surrounding whitespace. Return
`false` if the variable is unset, empty, or whitespace-only.
"""
function _has_env_nonempty(name::AbstractString)
    raw_value = get(ENV, name, "")
    return !isempty(strip(raw_value))
end
117+
118+
"""
    _getenv_parse_int(name::AbstractString)

Read the environment variable `name`, strip surrounding whitespace, and return
its value parsed as an `Int`.

# Throws
An `ErrorException` (via `error`) when the variable is not defined, is empty,
contains only whitespace, or cannot be parsed as an integer. Each case has its
own message naming the variable.
"""
function _getenv_parse_int(name::AbstractString)
    if !haskey(ENV, name)
        msg = "Environment variable is not defined: $(name)"
        error(msg)
    end
    original_value = ENV[name]
    if isempty(original_value)
        msg = "Environment variable is defined, but is empty: $(name)"
        error(msg)
    end
    stripped_value_str = strip(original_value)
    # Check the stripped string itself; the previous version tested a
    # variable name that was never assigned, so this line always threw.
    if isempty(stripped_value_str)
        msg = "Environment variable is defined, but contains only whitespace: $(name)"
        error(msg)
    end
    # `tryparse` yields `nothing` instead of throwing on malformed input.
    value_int = tryparse(Int, stripped_value_str)
    if !(value_int isa Int)
        msg =
            "Environment variable \"$(name)\" is defined, " *
            "but its value \"$(stripped_value_str)\" could not be parsed as an integer."
        error(msg)
    end
    return value_int
end

0 commit comments

Comments
 (0)