Mostly fix fault tolerance

jpsamaroo · jpsamaroo · commit 160f95909314 · 2021-05-29T08:20:28.000-04:00
diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl
@@ -51,6 +51,7 @@ Fields:
 - ready::Vector{Thunk} - The list of `Thunk`s that are ready to execute
 - cache::Dict{Thunk, Any} - Maps from a finished `Thunk` to it's cached result, often a DRef
 - running::Set{Thunk} - The set of currently-running `Thunk`s
+- running_on::Dict{Thunk,OSProc} - Maps from a `Thunk` to the OS process executing it
 - thunk_dict::Dict{Int, Any} - Maps from thunk IDs to a `Thunk`
 - node_order::Any - Function that returns the order of a thunk
 - worker_pressure::Dict{Int,Dict{Type,UInt}} - Cache of worker pressure
@@ -74,6 +75,7 @@ struct ComputeState
     ready::Vector{Thunk}
     cache::Dict{Thunk, Any}
     running::Set{Thunk}
+    running_on::Dict{Thunk,OSProc}
     thunk_dict::Dict{Int, Any}
     node_order::Any
     worker_pressure::Dict{Int,Dict{Type,UInt}}
@@ -98,6 +100,7 @@ function start_state(deps::Dict, node_order, chan)
                          Vector{Thunk}(undef, 0),
                          Dict{Thunk, Any}(),
                          Set{Thunk}(),
+                         Dict{Thunk,OSProc}(),
                          Dict{Int, Thunk}(),
                          node_order,
                          Dict{Int,Dict{Type,UInt}}(),
@@ -389,7 +392,7 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
         thunk_failed = false
         if res isa Exception
             if unwrap_nested_exception(res) isa Union{ProcessExitedException, Base.IOError}
-                @warn "Worker $(pid) died on thunk $thunk_id, rescheduling work"
+                @warn "Worker $(pid) died, rescheduling work"
 
                 # Remove dead worker from procs list
                 remove_dead_proc!(ctx, state, gproc)
@@ -640,6 +643,7 @@ function remove_dead_proc!(ctx, state, proc, options=ctx.options)
     delete!(state.worker_capacity, proc.pid)
     delete!(state.worker_loadavg, proc.pid)
     delete!(state.worker_chans, proc.pid)
+    state.procs_cache_list[] = nothing
 end
 
 function pop_with_affinity!(ctx, tasks, proc)
@@ -684,6 +688,7 @@ end
 
 function finish_task!(ctx, state, node, thunk_failed; free=true)
     pop!(state.running, node)
+    delete!(state.running_on, node)
     if !thunk_failed
         push!(state.finished, node)
     else
@@ -759,6 +764,7 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
     to_send = []
     for (thunk, util) in thunks
         push!(state.running, thunk)
+        state.running_on[thunk] = gproc
         if thunk.cache && thunk.cache_ref !== nothing
             # the result might be already cached
             data = unrelease(thunk.cache_ref) # ask worker to keep the data around
diff --git a/src/sch/fault-handler.jl b/src/sch/fault-handler.jl
@@ -2,9 +2,10 @@
     handle_fault(...)
 
 An internal function to handle a worker dying or being killed by the OS.
-Attempts to determine which `Thunk`s require rescheduling based on a
-"deadlist", and then corrects the scheduler's internal `ComputeState` struct
-to recover from the fault.
+Attempts to determine which `Thunk`s were running on (or had their results
+cached on) the dead worker, and stores them in a "deadlist". It uses this
+deadlist to correct the scheduler's internal `ComputeState` struct to recover
+from the fault.
 
 Note: The logic for this functionality is not currently perfectly robust to
 all failure modes, and is only really intended as a last-ditch attempt to
@@ -13,100 +14,72 @@ of DAGs, it *may* cause a `KeyError` or other failures in the scheduler due to
 the complexity of getting the internal state back to a consistent and proper
 state.
 """
-function handle_fault(ctx, state, oldproc)
-    # Find thunks whose results were cached on the dead worker and place them
-    # on what's called a "deadlist". This structure will direct the recovery
-    # of the scheduler's state.
+function handle_fault(ctx, state, deadproc)
+    @assert !isempty(procs(ctx)) "No workers left for fault handling!"
+
     deadlist = Thunk[]
+
+    # Evict cache entries that were stored on the worker
     for t in keys(state.cache)
         v = state.cache[t]
-        if v isa Chunk && v.handle isa DRef && v.handle.owner == oldproc.pid
+        if v isa Chunk && v.handle isa DRef && v.handle.owner == deadproc.pid
             push!(deadlist, t)
-            # Any inputs to dead cached thunks must be rescheduled
-            function bfs!(deadlist, t)
-                for input in t.inputs
-                    istask(input) || continue
-                    !(input in deadlist) && push!(deadlist, input)
-                    bfs!(deadlist, input)
-                end
-            end
-            bfs!(deadlist, t)
+            pop!(state.cache, t)
         end
     end
-    # TODO: Find *all* thunks who were actively running on the dead worker
-    # TODO: Set thunk.cache to nothing
+    # Put thunks that were running on the dead worker on the deadlist and drop their running_on entries
+    for t in collect(keys(state.running_on))
+        pid = state.running_on[t].pid
+        if pid == deadproc.pid
+            push!(deadlist, t)
+            delete!(state.running_on, t)
+        end
+    end
+    # Clear thunk.cache_ref
+    for t in deadlist
+        t.cache_ref = nothing
+    end
 
-    # Empty cache of dead thunks
-    for ct in keys(state.cache)
-        if ct in deadlist
-            delete!(state.cache, ct)
+    # Remove thunks from state.ready that have inputs on the deadlist
+    for idx in length(state.ready):-1:1
+        rt = state.ready[idx]
+        if any((input in deadlist) for input in rt.inputs)
+            deleteat!(state.ready, idx)
         end
     end
 
+    #=
     function fix_waitdicts!(state, deadlist, t::Thunk; isleaf=false)
         waiting, waiting_data = state.waiting, state.waiting_data
-        if !(t in keys(waiting))
-            waiting[t] = Set{Thunk}()
-        end
         if !isleaf
             # If we aren't a leaf thunk, then we may still need to recover
             # further into the DAG
+            if !haskey(waiting, t)
+                waiting[t] = Set{Thunk}()
+            end
             for input in t.inputs
                 istask(input) || continue
-                @assert haskey(waiting, t) "Error: $t not in state.waiting"
-                push!(waiting[t], input)
+                will_reschedule = !haskey(state.cache, input)
+                if will_reschedule
+                    push!(waiting[t], input)
+                end
                 push!(waiting_data[input], t)
-                isleaf = !(input in deadlist)
-                fix_waitdicts!(state, deadlist, input; isleaf=isleaf)
+                if will_reschedule
+                    isleaf = !(input in deadlist)
+                    fix_waitdicts!(state, deadlist, input; isleaf=isleaf)
+                end
             end
         end
         if isempty(waiting[t])
             delete!(waiting, t)
         end
     end
-
-    # Add state.waiting deps back to state.waiting
-    for ot in keys(state.waiting)
-        fix_waitdicts!(state, deadlist, ot)
-    end
-
-    #fix_waitdicts!(state, deadlist, thunk)
-
-    # Remove thunks from state.ready that have inputs on the deadlist
-    for idx in length(state.ready):-1:1
-        rt = state.ready[idx]
-        if any((input in deadlist) for input in rt.inputs)
-            deleteat!(state.ready, idx)
-        end
-    end
-
-    # Remove dead thunks from state.running, and add state.running
-    # deps back to state.waiting
-    wasrunning = copy(state.running)
-    empty!(state.running)
-    while !isempty(wasrunning)
-        temp = pop!(wasrunning)
-        if temp isa Thunk
-            if !(temp in deadlist)
-                push!(state.running, temp)
-            end
-            fix_waitdicts!(state, deadlist, temp)
-        elseif temp isa Vector
-            newtemp = []
-            for t in temp
-                fix_waitdicts!(state, deadlist, t)
-                if !(t in deadlist)
-                    push!(newtemp, t)
-                end
-            end
-            isempty(newtemp) || push!(state.running, newtemp)
-        else
-            throw("Unexpected type in recovery: $temp")
-        end
+    # Fixup state.waiting and state.waiting_data
+    for t in deadlist
+        fix_waitdicts!(state, deadlist, t)
     end
 
     # Reschedule inputs from deadlist
-    @assert !isempty(procs(ctx)) "No workers left for fault handling!"
     while length(deadlist) > 0
         dt = popfirst!(deadlist)
         if any((input in deadlist) for input in dt.inputs)
@@ -115,5 +88,10 @@ function handle_fault(ctx, state, oldproc)
         end
         push!(state.ready, dt)
     end
+    =#
+    seen = Dict{Thunk,Bool}()
+    for t in deadlist
+        reschedule_inputs!(state, t, seen)
+    end
     schedule!(ctx, state)
 end
diff --git a/src/sch/util.jl b/src/sch/util.jl
@@ -10,7 +10,8 @@ unwrap_nested_exception(err::RemoteException) =
 unwrap_nested_exception(err) = err
 
 "Prepares the scheduler to schedule `thunk`."
-function reschedule_inputs!(state, thunk)
+function reschedule_inputs!(state, thunk, seen=Dict{Thunk,Bool}())
+    haskey(seen, thunk) && return seen[thunk]
     w = get!(()->Set{Thunk}(), state.waiting, thunk)
     scheduled = false
     for input in thunk.inputs
@@ -26,14 +27,17 @@ function reschedule_inputs!(state, thunk)
         haskey(state.cache, input) && continue
         if (input in state.running) ||
            (input in state.ready) ||
-           reschedule_inputs!(state, input)
+           reschedule_inputs!(state, input, seen)
             push!(w, input)
             scheduled = true
+        else
+            error("Failed to reschedule $(input.id) for $(thunk.id)")
         end
     end
     if isempty(w) && !(thunk in state.errored)
         # Inputs are ready
         push!(state.ready, thunk)
+        delete!(state.waiting, thunk)
         return true
     else
         return scheduled