Skip to content

Commit a29a799

Browse files
committed
Fix worker fault reporting
1 parent 5769b09 commit a29a799

File tree

2 files changed

+42
-15
lines changed

2 files changed

+42
-15
lines changed

src/sch/Sch.jl

+38-9
Original file line number | Diff line number | Diff line change
@@ -225,11 +225,9 @@ end
225225
function cleanup(ctx)
226226
end
227227

228-
function _init_proc(uid)
229-
lock(ACTIVE_TASKS_LOCK) do
230-
@assert !haskey(ACTIVE_TASKS, uid)
231-
end
232-
end
228+
const WORKER_MONITOR_LOCK = Threads.ReentrantLock()
229+
const WORKER_MONITOR_TASKS = Dict{Int,Task}()
230+
const WORKER_MONITOR_CHANS = Dict{Int,Dict{UInt64,RemoteChannel}}()
233231
function init_proc(state, p)
234232
# Initialize pressure and capacity
235233
proc = OSProc(p.pid)
@@ -251,7 +249,31 @@ function init_proc(state, p)
251249
state.worker_capacity[p.pid] = cap
252250
end
253251
end
254-
# TODO: remotecall_fetch(_init_proc, p.pid, state.uid)
252+
lock(WORKER_MONITOR_LOCK) do
253+
wid = p.pid
254+
if !haskey(WORKER_MONITOR_TASKS, wid)
255+
t = @async begin
256+
try
257+
# Wait until this connection is terminated
258+
remotecall_fetch(sleep, wid, typemax(UInt64))
259+
catch err
260+
if err isa ProcessExitedException
261+
lock(WORKER_MONITOR_LOCK) do
262+
d = WORKER_MONITOR_CHANS[wid]
263+
for uid in keys(d)
264+
put!(d[uid], (wid, OSProc(wid), nothing, (ProcessExitedException(wid), nothing)))
265+
end
266+
empty!(d)
267+
delete!(WORKER_MONITOR_CHANS, wid)
268+
end
269+
end
270+
end
271+
end
272+
WORKER_MONITOR_TASKS[wid] = t
273+
WORKER_MONITOR_CHANS[wid] = Dict{UInt64,RemoteChannel}()
274+
end
275+
WORKER_MONITOR_CHANS[wid][state.uid] = state.chan
276+
end
255277

256278
# Setup worker-to-scheduler channels
257279
inp_chan = RemoteChannel(p.pid)
@@ -261,10 +283,16 @@ function init_proc(state, p)
261283
end
262284
end
263285
function _cleanup_proc(uid)
264-
empty!(CHUNK_CACHE)
286+
empty!(CHUNK_CACHE) # FIXME: Should be keyed on uid!
265287
end
266288
function cleanup_proc(state, p)
267-
remote_do(_cleanup_proc, p.pid, state.uid)
289+
lock(WORKER_MONITOR_LOCK) do
290+
wid = p.pid
291+
if haskey(WORKER_MONITOR_CHANS, wid)
292+
delete!(WORKER_MONITOR_CHANS[wid], state.uid)
293+
remote_do(_cleanup_proc, wid, state.uid)
294+
end
295+
end
268296
end
269297

270298
"Process-local count of actively-executing Dagger tasks per processor type."
@@ -367,7 +395,7 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
367395
remove_dead_proc!(ctx, state, gproc)
368396

369397
lock(state.lock) do
370-
handle_fault(ctx, state, state.thunk_dict[thunk_id], gproc)
398+
handle_fault(ctx, state, gproc)
371399
end
372400
continue
373401
else
@@ -403,6 +431,7 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
403431

404432
safepoint(state)
405433
end
434+
state.halt[] = true
406435
@sync for p in procs_to_use(ctx)
407436
@async cleanup_proc(state, p)
408437
end

src/sch/fault-handler.jl

+4-6
Original file line number | Diff line number | Diff line change
@@ -13,14 +13,11 @@ of DAGs, it *may* cause a `KeyError` or other failures in the scheduler due to
1313
the complexity of getting the internal state back to a consistent and proper
1414
state.
1515
"""
16-
function handle_fault(ctx, state, thunk, oldproc)
16+
function handle_fault(ctx, state, oldproc)
1717
# Find thunks whose results were cached on the dead worker and place them
1818
# on what's called a "deadlist". This structure will direct the recovery
1919
# of the scheduler's state.
20-
deadlist = Thunk[thunk]
21-
# This thunk is guaranteed to not have valid cached data anymore
22-
thunk.cache = false
23-
thunk.cache_ref = nothing
20+
deadlist = Thunk[]
2421
for t in keys(state.cache)
2522
v = state.cache[t]
2623
if v isa Chunk && v.handle isa DRef && v.handle.owner == oldproc.pid
@@ -37,6 +34,7 @@ function handle_fault(ctx, state, thunk, oldproc)
3734
end
3835
end
3936
# TODO: Find *all* thunks who were actively running on the dead worker
37+
# TODO: Set thunk.cache to nothing
4038

4139
# Empty cache of dead thunks
4240
for ct in keys(state.cache)
@@ -72,7 +70,7 @@ function handle_fault(ctx, state, thunk, oldproc)
7270
fix_waitdicts!(state, deadlist, ot)
7371
end
7472

75-
fix_waitdicts!(state, deadlist, thunk)
73+
#fix_waitdicts!(state, deadlist, thunk)
7674

7775
# Remove thunks from state.ready that have inputs on the deadlist
7876
for idx in length(state.ready):-1:1

0 commit comments

Comments
 (0)