diff --git a/NEWS.md b/NEWS.md index 9aebf5d42d954..86db7e8623256 100644 --- a/NEWS.md +++ b/NEWS.md @@ -101,6 +101,8 @@ New library features content is fully written, then call `closewrite` manually to avoid data-races. Or use the callback form of `open` to have all that handled automatically. +* `@time` now shows if the time spent on scheduling tasks (not including sleeping/waiting) is more + than 10% of the total time, as a percentage. ([#55103]) * `@timed` now additionally returns the elapsed compilation and recompilation time ([#52889]) * `escape_string` takes additional keyword arguments `ascii=true` (to escape all non-ASCII characters) and `fullhex=true` (to require full 4/8-digit hex numbers diff --git a/base/Base.jl b/base/Base.jl index 84e10ca788ba2..3662d3be841a2 100644 --- a/base/Base.jl +++ b/base/Base.jl @@ -424,6 +424,10 @@ include("weakkeydict.jl") # ScopedValues include("scopedvalues.jl") +# used by task.jl +const task_times_per_thread = ScopedValues.ScopedValue{Vector{UInt64}}() # time_ns always returns UInt64 +const sleep_times_per_thread = ScopedValues.ScopedValue{Vector{UInt64}}() + # metaprogramming include("meta.jl") diff --git a/base/task.jl b/base/task.jl index 6cb1ff785eeee..44d437cf3f647 100644 --- a/base/task.jl +++ b/base/task.jl @@ -1098,7 +1098,18 @@ end function try_yieldto(undo) try + # when timing time spent in the scheduler we time the time spent in the task + # then subtract the elapsed time in `@timed` from the total time to get the scheduler time + t, ts = isassigned(task_times_per_thread) ? (time_ns(), task_times_per_thread[]) : (nothing, nothing) ccall(:jl_switch, Cvoid, ()) + if ts !== nothing + t = time_ns() - t + tid = Threads.threadid() + if tid <= length(ts) + ts[tid] += t + end + # TODO: grow the array if needed i.e. if a thread is added within the `@timed` block + end catch undo(ccall(:jl_get_next_task, Ref{Task}, ())) rethrow() @@ -1170,8 +1181,19 @@ end function wait() GC.safepoint() - W = workqueue_for(Threads.threadid()) + tid = Threads.threadid() + W = workqueue_for(tid) + # optional sleep timing + t, ts = isassigned(sleep_times_per_thread) ? (time_ns(), sleep_times_per_thread[]) : (nothing, nothing) poptask(W) + if ts !== nothing # sleep timing is enabled + t = time_ns() - t + if tid <= length(ts) + ts[tid] += t + end + # TODO: grow the array if needed i.e. if a thread is added within the `@timed` block + end + result = try_yieldto(ensure_rescheduled) process_events() # return when we come out of the queue diff --git a/base/timing.jl b/base/timing.jl index b094aa230e1c2..3b925fd63bb00 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -196,9 +196,11 @@ function format_bytes(bytes; binary=true) # also used by InteractiveUtils end end -function time_print(io::IO, elapsedtime, bytes=0, gctime=0, allocs=0, lock_conflicts=0, compile_time=0, recompile_time=0, newline=false; - msg::Union{String,Nothing}=nothing) +function time_print(io::IO, elapsedtime, bytes=0, gctime=0, allocs=0, lock_conflicts=0, compile_time=0, recompile_time=0, sched_time_avg=0, + newline=false; msg::Union{String,Nothing}=nothing) timestr = Ryu.writefixed(Float64(elapsedtime/1e9), 6) + wall_time_sched_perc = sched_time_avg / (elapsedtime / 1e9) + sched_thresh = 0.1 str = sprint() do io if msg isa String print(io, msg, ": ") @@ -206,7 +208,7 @@ function time_print(io::IO, elapsedtime, bytes=0, gctime=0, allocs=0, lock_confl print(io, length(timestr) < 10 ? (" "^(10 - length(timestr))) : "") end print(io, timestr, " seconds") - parens = bytes != 0 || allocs != 0 || gctime > 0 || lock_conflicts > 0 || compile_time > 0 + parens = bytes != 0 || allocs != 0 || gctime > 0 || wall_time_sched_perc > sched_thresh || lock_conflicts > 0 || compile_time > 0 parens && print(io, " (") if bytes != 0 || allocs != 0 allocs, ma = prettyprint_getunits(allocs, length(_cnt_units), Int64(1000)) @@ -223,15 +225,21 @@ function time_print(io::IO, elapsedtime, bytes=0, gctime=0, allocs=0, lock_confl end print(io, Ryu.writefixed(Float64(100*gctime/elapsedtime), 2), "% gc time") end - if lock_conflicts > 0 + if wall_time_sched_perc > sched_thresh if bytes != 0 || allocs != 0 || gctime > 0 print(io, ", ") end + print(io, Ryu.writefixed(Float64(100*wall_time_sched_perc), 2), "% scheduling time") + end + if lock_conflicts > 0 + if bytes != 0 || allocs != 0 || gctime > 0 || wall_time_sched_perc > sched_thresh + print(io, ", ") + end plural = lock_conflicts == 1 ? "" : "s" print(io, lock_conflicts, " lock conflict$plural") end if compile_time > 0 - if bytes != 0 || allocs != 0 || gctime > 0 || lock_conflicts > 0 + if bytes != 0 || allocs != 0 || gctime > 0 || wall_time_sched_perc > sched_thresh || lock_conflicts > 0 print(io, ", ") end print(io, Ryu.writefixed(Float64(100*compile_time/elapsedtime), 2), "% compilation time") @@ -285,7 +293,9 @@ A macro to execute an expression, printing the time it took to execute, the numb allocations, and the total number of bytes its execution caused to be allocated, before returning the value of the expression. Any time spent garbage collecting (gc), compiling new code, or recompiling invalidated code is shown as a percentage. Any lock conflicts -where a [`ReentrantLock`](@ref) had to wait are shown as a count. +where a [`ReentrantLock`](@ref) had to wait are shown as a count. If the time spent on +scheduling tasks (not including sleeping/waiting) is more than 10% of the total time it +is also shown as a percentage. Optionally provide a description string to print before the time report. @@ -309,6 +319,9 @@ See also [`@showtime`](@ref), [`@timev`](@ref), [`@timed`](@ref), [`@elapsed`](@ !!! compat "Julia 1.11" The reporting of any lock conflicts was added in Julia 1.11. +!!! compat "Julia 1.12" + The reporting of excessive scheduling time was added in Julia 1.12. + ```julia-repl julia> x = rand(10,10); @@ -346,7 +359,7 @@ macro time(msg, ex) local ret = @timed $(esc(ex)) local _msg = $(esc(msg)) local _msg_str = _msg === nothing ? _msg : string(_msg) - time_print(stdout, ret.time*1e9, ret.gcstats.allocd, ret.gcstats.total_time, gc_alloc_count(ret.gcstats), ret.lock_conflicts, ret.compile_time*1e9, ret.recompile_time*1e9, true; msg=_msg_str) + time_print(stdout, ret.time*1e9, ret.gcstats.allocd, ret.gcstats.total_time, gc_alloc_count(ret.gcstats), ret.lock_conflicts, ret.compile_time*1e9, ret.recompile_time*1e9, ret.sched_time_avg, true; msg=_msg_str) ret.value end end @@ -555,8 +568,9 @@ end A macro to execute an expression, and return the value of the expression, elapsed time in seconds, total bytes allocated, garbage collection time, an object with various memory allocation -counters, compilation time in seconds, and recompilation time in seconds. Any lock conflicts -where a [`ReentrantLock`](@ref) had to wait are shown as a count. +counters, compilation time in seconds, with recompilation time in seconds, any lock conflicts +where a [`ReentrantLock`](@ref) had to wait as a count, and the average time spent on +scheduling tasks (not including sleeping/waiting) across threads. In some cases the system will look inside the `@timed` expression and compile some of the called code before execution of the top-level expression begins. When that happens, some @@ -596,33 +610,50 @@ julia> stats.recompile_time !!! compat "Julia 1.11" The `lock_conflicts`, `compile_time`, and `recompile_time` fields were added in Julia 1.11. + +!!! compat "Julia 1.12" + The `sched_time_avg` field was added in Julia 1.11. """ macro timed(ex) quote Experimental.@force_compile - Threads.lock_profiling(true) - local lock_conflicts = Threads.LOCK_CONFLICT_COUNT[] - local stats = gc_num() - local elapsedtime = time_ns() - cumulative_compile_timing(true) - local compile_elapsedtimes = cumulative_compile_time_ns() - local val = @__tryfinally($(esc(ex)), - (elapsedtime = time_ns() - elapsedtime; - cumulative_compile_timing(false); - compile_elapsedtimes = cumulative_compile_time_ns() .- compile_elapsedtimes; - lock_conflicts = Threads.LOCK_CONFLICT_COUNT[] - lock_conflicts; - Threads.lock_profiling(false)) - ) - local diff = GC_Diff(gc_num(), stats) - ( - value=val, - time=elapsedtime/1e9, - bytes=diff.allocd, - gctime=diff.total_time/1e9, - gcstats=diff, - lock_conflicts=lock_conflicts, - compile_time=compile_elapsedtimes[1]/1e9, - recompile_time=compile_elapsedtimes[2]/1e9 - ) + ScopedValues.@with task_times_per_thread => zeros(UInt, Threads.maxthreadid()) sleep_times_per_thread => zeros(UInt, Threads.maxthreadid()) begin + Experimental.@force_compile + Threads.lock_profiling(true) + local lock_conflicts = Threads.LOCK_CONFLICT_COUNT[] + local stats = gc_num() + local elapsedtime = time_ns() + cumulative_compile_timing(true) + local compile_elapsedtimes = cumulative_compile_time_ns() + local val = @__tryfinally($(esc(ex)), + (elapsedtime = time_ns() - elapsedtime; + cumulative_compile_timing(false); + compile_elapsedtimes = cumulative_compile_time_ns() .- compile_elapsedtimes; + lock_conflicts = Threads.LOCK_CONFLICT_COUNT[] - lock_conflicts; + Threads.lock_profiling(false)) + ) + local diff = GC_Diff(gc_num(), stats) + local sched_times = Int64[] + for i in 1:length(task_times_per_thread[]) + # filter out zeros in task timers which can only happen if nothing was scheduled + if task_times_per_thread[][i] != 0 + # subtract task and sleep times from global elapsed time to get scheduling time per thread + push!(sched_times, Int64(elapsedtime) - Int64(task_times_per_thread[][i]) - Int64(sleep_times_per_thread[][i])) + end + end + local sched_time_avg = isempty(sched_times) ? 0 : sum(sched_times) / length(sched_times) + + ( + value=val, + time=elapsedtime/1e9, + bytes=diff.allocd, + gctime=diff.total_time/1e9, + gcstats=diff, + lock_conflicts=lock_conflicts, + compile_time=compile_elapsedtimes[1]/1e9, + recompile_time=compile_elapsedtimes[2]/1e9, + sched_time_avg=sched_time_avg/1e9 + ) + end end end diff --git a/test/scopedvalues.jl b/test/scopedvalues.jl index 2c2f4a510c1c9..056a8b425c50d 100644 --- a/test/scopedvalues.jl +++ b/test/scopedvalues.jl @@ -82,11 +82,16 @@ end @testset "show" begin @test sprint(show, ScopedValue{Int}()) == "Base.ScopedValues.ScopedValue{$Int}(undefined)" @test sprint(show, sval) == "Base.ScopedValues.ScopedValue{$Int}(1)" - @test sprint(show, Core.current_scope()) == "nothing" + if Core.current_scope() === nothing + # Base.runtests uses @timed which introduces a scope for scheduler timing + @test sprint(show, Core.current_scope()) == "nothing" + end with(sval => 2.0) do @test sprint(show, sval) == "Base.ScopedValues.ScopedValue{$Int}(2)" objid = sprint(show, Base.objectid(sval)) - @test sprint(show, Core.current_scope()) == "Base.ScopedValues.Scope(Base.ScopedValues.ScopedValue{$Int}@$objid => 2)" + # Base.runtests uses @timed which introduces a scope for scheduler timing + @test startswith(sprint(show, Core.current_scope()), "Base.ScopedValues.Scope(") + @test contains(sprint(show, Core.current_scope()), "Base.ScopedValues.ScopedValue{$Int}@$objid => 2") end end