Commit 4612325 (0 parents)

initial commit

File tree: 6 files changed (+158 −0 lines changed)

Diff for: LICENSE (+21 lines)

MIT License

Copyright (c) 2020 Joseph Kleinhenz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Diff for: Manifest.toml (+15 lines)

# This file is machine-generated - editing it directly is not advised

[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

Diff for: Project.toml (+7 lines)

name = "SlurmClusterManager"
uuid = "c82cd089-7bf7-41d7-976b-6b5d413cbe0a"
authors = ["Joseph Kleinhenz <[email protected]>"]
version = "0.1.0"

[deps]
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"

Diff for: README.md (+34 lines)

# SlurmClusterManager.jl

This package provides support for using Julia within the Slurm cluster environment.
The code is adapted from [ClusterManagers.jl](https://github.com/JuliaParallel/ClusterManagers.jl) with some modifications.

## Usage

This script uses all resources from a Slurm allocation as Julia workers and prints the id and hostname of each one.

```jl
#!/usr/bin/env julia

using Distributed, SlurmClusterManager
addprocs(SlurmManager())
@everywhere println("hello from $(myid()):$(gethostname())")
```

If the code is saved in `script.jl`, it can be queued and executed on two nodes with 64 workers per node by running

```
sbatch -N 2 --ntasks-per-node=64 script.jl
```

## Differences from `ClusterManagers.jl`

* Only supports Slurm (see this [issue](https://github.com/JuliaParallel/ClusterManagers.jl/issues/58) for some background).
* Requires that `SlurmManager` be created inside a Slurm allocation created by sbatch/salloc.
  Specifically, `SLURM_JOBID` and `SLURM_NTASKS` must be defined in order to construct `SlurmManager`.
  This matches typical HPC workflows, where resources are requested with sbatch and then used by the application code.
  In contrast, `ClusterManagers.jl` will *dynamically* request resources when run outside of an existing Slurm allocation.
  I found this was rarely what I wanted: it leaves the manager process running on a login node and makes the script wait until resources are granted, which is better handled by the actual Slurm queueing system.
  (A minimal guard illustrating this requirement is sketched after this list.)
* Does not take any Slurm arguments. All Slurm arguments are inherited from the external Slurm allocation created by sbatch/salloc.
* Output from workers is redirected to the manager process instead of requiring a separate output file for every task.
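As a rough sketch of the allocation requirement above (not part of this commit), the check can be made explicit in the driver script before the manager is constructed. The `haskey` guard and the error message are illustrative assumptions; `SlurmManager()` and `addprocs` are the entry points shown in the usage example.

```jl
#!/usr/bin/env julia
# Illustrative guard: fail fast when run outside an sbatch/salloc allocation.
using Distributed, SlurmClusterManager

if !(haskey(ENV, "SLURM_JOBID") && haskey(ENV, "SLURM_NTASKS"))
    error("SlurmManager needs SLURM_JOBID and SLURM_NTASKS; submit this script with sbatch/salloc")
end

addprocs(SlurmManager())   # one worker per Slurm task in the current allocation
@everywhere println("hello from $(myid()):$(gethostname())")
```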

Diff for: src/SlurmClusterManager.jl (+10 lines)

module SlurmClusterManager

export SlurmManager, launch, manage

using Distributed
import Distributed: launch, manage

include("slurmmanager.jl")

end # module

Diff for: src/slurmmanager.jl (+71 lines)

"""
ClusterManager for a Slurm allocation

Represents the resources available within a slurm allocation created by salloc/sbatch.
The environment variables `SLURM_JOBID` and `SLURM_NTASKS` must be defined to construct this object.
"""
struct SlurmManager <: ClusterManager
  jobid::Int
  ntasks::Int
  verbose::Bool
  launch_timeout::Float64

  function SlurmManager(;verbose=false, launch_timeout=60.0)
    @assert "SLURM_JOBID" in keys(ENV)
    @assert "SLURM_NTASKS" in keys(ENV)

    jobid = parse(Int, ENV["SLURM_JOBID"])
    ntasks = parse(Int, ENV["SLURM_NTASKS"])

    new(jobid, ntasks, verbose, launch_timeout)
  end
end

function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Condition)
  try
    exehome = params[:dir]
    exename = params[:exename]
    exeflags = params[:exeflags]

    # Start all worker tasks with a single srun; each worker prints its listen address on stdout.
    srun_cmd = `srun -D $exehome $exename $exeflags --worker=$(cluster_cookie())`
    srun_proc = open(srun_cmd)

    t = @async for i in 1:manager.ntasks
      manager.verbose && println("connecting to worker $i out of $(manager.ntasks)")

      line = readline(srun_proc)
      m = match(r".*:(\d*)#(.*)", line)
      m === nothing && error("could not parse $line")

      config = WorkerConfig()
      config.port = parse(Int, m[1])
      config.host = strip(m[2])

      # Keep a reference to the proc, so it's properly closed once the last worker exits.
      config.userdata = srun_proc
      push!(instances_arr, config)
      notify(c)
    end

    # workers must be launched before timeout otherwise interrupt
    status = timedwait(() -> istaskdone(t), manager.launch_timeout)
    if status !== :ok
      @async Base.throwto(t, ErrorException("launch_timeout exceeded"))
    end
    wait(t)

    # redirect output
    @async while !eof(srun_proc)
      line = readline(srun_proc)
      println(line)
    end

  catch e
    println("Error launching Slurm job:")
    rethrow(e)
  end
end

function manage(manager::SlurmManager, id::Integer, config::WorkerConfig, op::Symbol)
  # This function needs to exist, but so far we don't do anything
end
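For context (not part of the diff above), here is a hedged sketch of how this manager is typically driven: `addprocs` from `Distributed` dispatches to the `launch` method defined in this file, so user code only touches the constructor. The `verbose` and `launch_timeout` values are simply the constructor's keyword arguments; the file name `driver.jl` and the final `rmprocs` cleanup are illustrative assumptions, not part of this commit.

```jl
#!/usr/bin/env julia
# Hypothetical driver.jl, submitted e.g. with `sbatch -N 2 --ntasks-per-node=64 driver.jl`.
using Distributed, SlurmClusterManager

# addprocs calls launch(::SlurmManager, ...) above, which starts one srun for all tasks
# and parses each worker's port/host handshake line from srun's stdout.
addprocs(SlurmManager(verbose=true, launch_timeout=120.0))

@everywhere println("worker $(myid()) is running on $(gethostname())")

rmprocs(workers())   # tear the workers (and the underlying srun) down when done
```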
