2
2
handle_fault(...)
3
3
4
4
An internal function to handle a worker dying or being killed by the OS.
5
- Attempts to determine which `Thunk`s require rescheduling based on a
6
- "deadlist", and then corrects the scheduler's internal `ComputeState` struct
7
- to recover from the fault.
5
+ Attempts to determine which `Thunk`s were running on (or had their results
6
+ cached on) the dead worker, and stores them in a "deadlist". It uses this
7
+ deadlist to correct the scheduler's internal `ComputeState` struct to recover
8
+ from the fault.
8
9
9
10
Note: The logic for this functionality is not currently perfectly robust to
10
11
all failure modes, and is only really intended as a last-ditch attempt to
@@ -13,100 +14,72 @@ of DAGs, it *may* cause a `KeyError` or other failures in the scheduler due to
13
14
the complexity of getting the internal state back to a consistent and proper
14
15
state.
15
16
"""
16
- function handle_fault (ctx, state, oldproc)
17
- # Find thunks whose results were cached on the dead worker and place them
18
- # on what's called a "deadlist". This structure will direct the recovery
19
- # of the scheduler's state.
17
+ function handle_fault (ctx, state, deadproc)
18
+ @assert ! isempty (procs (ctx)) " No workers left for fault handling!"
19
+
20
20
deadlist = Thunk[]
21
+
22
+ # Evict cache entries that were stored on the worker
21
23
for t in keys (state. cache)
22
24
v = state. cache[t]
23
- if v isa Chunk && v. handle isa DRef && v. handle. owner == oldproc . pid
25
+ if v isa Chunk && v. handle isa DRef && v. handle. owner == deadproc . pid
24
26
push! (deadlist, t)
25
- # Any inputs to dead cached thunks must be rescheduled
26
- function bfs! (deadlist, t)
27
- for input in t. inputs
28
- istask (input) || continue
29
- ! (input in deadlist) && push! (deadlist, input)
30
- bfs! (deadlist, input)
31
- end
32
- end
33
- bfs! (deadlist, t)
27
+ pop! (state. cache, t)
34
28
end
35
29
end
36
- # TODO : Find *all* thunks who were actively running on the dead worker
37
- # TODO : Set thunk.cache to nothing
30
+ # Remove thunks that were running on the worker
31
+ for t in collect (keys (state. running_on))
32
+ pid = state. running_on[t]. pid
33
+ if pid == deadproc. pid
34
+ push! (deadlist, t)
35
+ delete! (state. running_on, t)
36
+ end
37
+ end
38
+ # Clear thunk.cache_ref
39
+ for t in deadlist
40
+ t. cache_ref = nothing
41
+ end
38
42
39
- # Empty cache of dead thunks
40
- for ct in keys (state. cache)
41
- if ct in deadlist
42
- delete! (state. cache, ct)
43
+ # Remove thunks from state.ready that have inputs on the deadlist
44
+ for idx in length (state. ready): - 1 : 1
45
+ rt = state. ready[idx]
46
+ if any ((input in deadlist) for input in rt. inputs)
47
+ deleteat! (state. ready, idx)
43
48
end
44
49
end
45
50
51
+ #=
46
52
function fix_waitdicts!(state, deadlist, t::Thunk; isleaf=false)
47
53
waiting, waiting_data = state.waiting, state.waiting_data
48
- if ! (t in keys (waiting))
49
- waiting[t] = Set {Thunk} ()
50
- end
51
54
if !isleaf
52
55
# If we aren't a leaf thunk, then we may still need to recover
53
56
# further into the DAG
57
+ if !haskey(waiting, t)
58
+ waiting[t] = Set{Thunk}()
59
+ end
54
60
for input in t.inputs
55
61
istask(input) || continue
56
- @assert haskey (waiting, t) " Error: $t not in state.waiting"
57
- push! (waiting[t], input)
62
+ will_reschedule = !haskey(state.cache, input)
63
+ if will_reschedule
64
+ push!(waiting[t], input)
65
+ end
58
66
push!(waiting_data[input], t)
59
- isleaf = ! (input in deadlist)
60
- fix_waitdicts! (state, deadlist, input; isleaf= isleaf)
67
+ if will_reschedule
68
+ isleaf = !(input in deadlist)
69
+ fix_waitdicts!(state, deadlist, input; isleaf=isleaf)
70
+ end
61
71
end
62
72
end
63
73
if isempty(waiting[t])
64
74
delete!(waiting, t)
65
75
end
66
76
end
67
-
68
- # Add state.waiting deps back to state.waiting
69
- for ot in keys (state. waiting)
70
- fix_waitdicts! (state, deadlist, ot)
71
- end
72
-
73
- # fix_waitdicts!(state, deadlist, thunk)
74
-
75
- # Remove thunks from state.ready that have inputs on the deadlist
76
- for idx in length (state. ready): - 1 : 1
77
- rt = state. ready[idx]
78
- if any ((input in deadlist) for input in rt. inputs)
79
- deleteat! (state. ready, idx)
80
- end
81
- end
82
-
83
- # Remove dead thunks from state.running, and add state.running
84
- # deps back to state.waiting
85
- wasrunning = copy (state. running)
86
- empty! (state. running)
87
- while ! isempty (wasrunning)
88
- temp = pop! (wasrunning)
89
- if temp isa Thunk
90
- if ! (temp in deadlist)
91
- push! (state. running, temp)
92
- end
93
- fix_waitdicts! (state, deadlist, temp)
94
- elseif temp isa Vector
95
- newtemp = []
96
- for t in temp
97
- fix_waitdicts! (state, deadlist, t)
98
- if ! (t in deadlist)
99
- push! (newtemp, t)
100
- end
101
- end
102
- isempty (newtemp) || push! (state. running, newtemp)
103
- else
104
- throw (" Unexpected type in recovery: $temp " )
105
- end
77
+ # Fixup state.waiting and state.waiting_data
78
+ for t in deadlist
79
+ fix_waitdicts!(state, deadlist, t)
106
80
end
107
81
108
82
# Reschedule inputs from deadlist
109
- @assert ! isempty (procs (ctx)) " No workers left for fault handling!"
110
83
while length(deadlist) > 0
111
84
dt = popfirst!(deadlist)
112
85
if any((input in deadlist) for input in dt.inputs)
@@ -115,5 +88,10 @@ function handle_fault(ctx, state, oldproc)
115
88
end
116
89
push!(state.ready, dt)
117
90
end
91
+ =#
92
+ seen = Dict {Thunk,Bool} ()
93
+ for t in deadlist
94
+ reschedule_inputs! (state, t, seen)
95
+ end
118
96
schedule! (ctx, state)
119
97
end
0 commit comments