Skip to content

Commit faca41c

Browse files
committed
Temp commit for visualization
1 parent 8c79341 commit faca41c

File tree

7 files changed

+179
-69
lines changed

7 files changed

+179
-69
lines changed

benchmarks/benchmark.jl

+61-28
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#ccall(:jl_exit_on_sigint, Cvoid, (Cint,), 0)
2+
13
using Distributed
24
if haskey(ENV, "BENCHMARK_PROCS")
35
const np, nt = parse.(Ref(Int), split(ENV["BENCHMARK_PROCS"], ":"))
@@ -40,6 +42,10 @@ elseif render == "offline"
4042
using FFMPEG, FileIO, ImageMagick
4143
end
4244
const RENDERS = Dict{Int,Dict}()
45+
const live_port = parse(Int, get(ENV, "BENCHMARK_LIVE_PORT", "8000"))
46+
47+
const graph = parse(Bool, get(ENV, "BENCHMARK_GRAPH", "0"))
48+
const profile = parse(Bool, get(ENV, "BENCHMARK_PROFILE", "0"))
4349

4450
_benches = get(ENV, "BENCHMARK", "cpu,cpu+dagger")
4551
const benches = []
@@ -124,7 +130,7 @@ end
124130

125131
# Closed-form operation count: 11*ncol*nrow*nfeatures + 2*(ncol + nrow)*nfeatures.
# NOTE(review): presumably the theoretical FLOP count of one NNMF pass over an
# nrow-by-ncol input with nfeatures factors — confirm against the nnmf kernel.
function theory_flops(nrow, ncol, nfeatures)
    dense_term = 11 * ncol * nrow * nfeatures
    edge_term = 2 * (ncol + nrow) * nfeatures
    return dense_term + edge_term
end
126132

127-
function nmf_suite(; dagger, accel, network, kwargs...)
133+
function nmf_suite(ctx; dagger, accel, network)
128134
suite = BenchmarkGroup()
129135

130136
#= TODO: Re-enable
@@ -194,59 +200,67 @@ function nmf_suite(; dagger, accel, network, kwargs...)
194200
else
195201
error("Unknown network $network")
196202
end
203+
rr = true
197204
opts = if accel == "cuda"
198205
Dagger.Sch.SchedulerOptions(;proctypes=[
199206
DaggerGPU.CuArrayDeviceProc
200-
], network=net)
207+
], network=net,round_robin=rr)
201208
elseif accel == "amdgpu"
202209
Dagger.Sch.SchedulerOptions(;proctypes=[
203210
DaggerGPU.ROCArrayProc
204-
], network=net)
211+
], network=net,round_robin=rr)
205212
elseif accel == "cpu"
206-
Dagger.Sch.SchedulerOptions(;network=net)
213+
Dagger.Sch.SchedulerOptions(;network=net,round_robin=rr)
207214
else
208215
error("Unknown accelerator $accel")
209216
end
210-
ctx = Context(collect((1:nw) .+ 1); kwargs...)
211217
p = sum([length(Dagger.get_processors(OSProc(id))) for id in 2:(nw+1)])
218+
#bsz = ncol ÷ length(workers())
219+
bsz = ncol ÷ 64
212220
nsuite["Workers: $nw"] = @benchmarkable begin
213-
compute($ctx, nnmf($X[], $W[], $H[]); options=$opts)
221+
_ctx = Context($ctx, workers()[1:$nw])
222+
compute(_ctx, nnmf($X[], $W[], $H[]); options=$opts)
214223
end setup=begin
215224
_nw, _scale = $nw, $scale
216225
@info "Starting $_nw worker Dagger NNMF (scale by $_scale)"
217-
if render != ""
218-
Dagger.show_gantt($ctx; width=1800, window_length=20, delay=2, port=4040, live=live)
219-
end
220226
if $accel == "cuda"
221227
# FIXME: Allocate with CUDA.rand if possible
222-
$X[] = Dagger.mapchunks(CUDA.cu, compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nrow, $ncol); options=$opts))
223-
$W[] = Dagger.mapchunks(CUDA.cu, compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nrow, $nfeatures); options=$opts))
224-
$H[] = Dagger.mapchunks(CUDA.cu, compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nfeatures, $ncol); options=$opts))
228+
$X[] = Dagger.mapchunks(CUDA.cu, compute(rand(Blocks($nrow, $bsz), Float32, $nrow, $ncol); options=$opts))
229+
$W[] = Dagger.mapchunks(CUDA.cu, compute(rand(Blocks($nrow, $bsz), Float32, $nrow, $nfeatures); options=$opts))
230+
$H[] = Dagger.mapchunks(CUDA.cu, compute(rand(Blocks($nrow, $bsz), Float32, $nfeatures, $ncol); options=$opts))
225231
elseif $accel == "amdgpu"
226232
$X[] = Dagger.mapchunks(ROCArray, compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nrow, $ncol); options=$opts))
227233
$W[] = Dagger.mapchunks(ROCArray, compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nrow, $nfeatures); options=$opts))
228234
$H[] = Dagger.mapchunks(ROCArray, compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nfeatures, $ncol); options=$opts))
229235
elseif $accel == "cpu"
230-
$X[] = compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nrow, $ncol); options=$opts)
231-
$W[] = compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nrow, $nfeatures); options=$opts)
232-
$H[] = compute(rand(Blocks($nrow, $ncol÷$p), Float32, $nfeatures, $ncol); options=$opts)
236+
$X[] = compute(rand(Blocks($nrow, $bsz), Float32, $nrow, $ncol); options=$opts)
237+
$W[] = compute(rand(Blocks($nrow, $bsz), Float32, $nrow, $nfeatures); options=$opts)
238+
$H[] = compute(rand(Blocks($nrow, $bsz), Float32, $nfeatures, $ncol); options=$opts)
233239
end
234240
end teardown=begin
235-
if render != ""
241+
if render != "" && !live
236242
Dagger.continue_rendering[] = false
237-
video_paths = take!(Dagger.render_results)
238-
try
239-
video_data = Dict(key=>read(video_paths[key]) for key in keys(video_paths))
240-
push!(get!(()->[], RENDERS[$scale], $nw), video_data)
241-
catch
243+
for i in 1:5
244+
isready(Dagger.render_results) && break
245+
sleep(1)
246+
end
247+
if isready(Dagger.render_results)
248+
video_paths = take!(Dagger.render_results)
249+
try
250+
video_data = Dict(key=>read(video_paths[key]) for key in keys(video_paths))
251+
push!(get!(()->[], RENDERS[$scale], $nw), video_data)
252+
catch err
253+
@error "Failed to process render results" exception=(err,catch_backtrace())
254+
end
255+
else
256+
@warn "Failed to fetch render results"
242257
end
243258
end
244259
$X[] = nothing
245260
$W[] = nothing
246261
$H[] = nothing
247262
@everywhere GC.gc()
248263
end
249-
break
250264
nw ÷= 2
251265
end
252266
suite["NNMF scaled by: $scale"] = nsuite
@@ -261,28 +275,42 @@ function main()
261275
output_prefix = "result-$(np)workers-$(nt)threads-$(Dates.now())"
262276

263277
suites = Dict()
278+
graph_opts = if graph && render != ""
279+
(log_sink=Dagger.LocalEventLog(), log_file=output_prefix*".dot")
280+
elseif render != ""
281+
(log_sink=Dagger.LocalEventLog(),)
282+
else
283+
NamedTuple()
284+
end
285+
ctx = Context(collect((1:nw) .+ 1); profile=profile, graph_opts...)
264286
for bench in benches
265287
name = bench.name
266288
println("creating $name benchmarks")
267-
suites[name] = if bench.dagger
268-
nmf_suite(; dagger=true, accel=bench.accel, network=bench.network, log_sink=Dagger.LocalEventLog(), log_file=output_prefix*".dot", profile=false)
269-
else
270-
nmf_suite(; dagger=false, accel=bench.accel, network=bench.network)
289+
suites[name] = nmf_suite(ctx; dagger=bench.dagger, accel=bench.accel, network=bench.network)
290+
end
291+
if render != ""
292+
Dagger.show_gantt(ctx; width=1800, window_length=5, delay=2, port=live_port, live=live)
293+
if live
294+
# Make sure server code is compiled
295+
sleep(1)
296+
run(pipeline(`curl -s localhost:$live_port/`; stdout=devnull))
297+
run(pipeline(`curl -s localhost:$live_port/profile`; stdout=devnull))
298+
@info "Rendering started on port $live_port"
271299
end
272300
end
273301
res = Dict()
274302
for bench in benches
275303
name = bench.name
276304
println("running $name benchmarks")
277305
res[name] = try
278-
run(suites[name]; samples=5, seconds=10*60, gcsample=true)
306+
run(suites[name]; samples=3, seconds=10*60, gcsample=true)
279307
catch err
280308
@error "Error running $name benchmarks" exception=(err,catch_backtrace())
281309
nothing
282310
end
283311
end
284312
for bench in benches
285-
println("benchmark results for $(bench.name): $(res[bench.name])")
313+
println("benchmark results for $(bench.name): $(minimum(res[bench.name]))")
286314
end
287315

288316
println("saving results in $output_prefix.$output_format")
@@ -294,6 +322,11 @@ function main()
294322
serialize(io, outdict)
295323
end
296324
end
325+
326+
if parse(Bool, get(ENV, "BENCHMARK_VISUALIZE", "0"))
327+
run(`$(Base.julia_cmd()) $(joinpath(pwd(), "visualize.jl")) -- $(output_prefix*"."*output_format)`)
328+
end
329+
297330
println("Done.")
298331

299332
# TODO: Compare with multiple results

benchmarks/visualize.jl

+68-26
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,61 @@
1-
using JLD
1+
using JLD, Serialization
22
using BenchmarkTools
33
using TypedTables
44

5-
res = JLD.load(ARGS[1])
5+
res = if endswith(ARGS[1], ".jld")
6+
JLD.load(ARGS[1])
7+
elseif endswith(ARGS[1], ".jls")
8+
deserialize(ARGS[1])
9+
else
10+
error("Unknown file type")
11+
end
612

7-
serial_results = res["results"]["Serial"]
8-
dagger_results = res["results"]["Dagger"]
13+
serial_results = filter(x->!occursin("dagger", x[1]), res["results"])
14+
@assert length(keys(serial_results)) > 0 "No serial results found"
15+
dagger_results = filter(x->occursin("dagger", x[1]), res["results"])
16+
@assert length(keys(dagger_results)) > 0 "No Dagger results found"
17+
18+
scale_set = sort([key=>parse(Int, lstrip(last(split(key, ':')), ' ')) for key in keys(first(serial_results)[2])]; by=x->x[2])
19+
nw_set = sort([key=>parse(Int, lstrip(last(split(key, ':')), ' ')) for key in keys(first(dagger_results)[2][first(first(scale_set))])]; by=x->x[2])
20+
raw_table = NamedTuple[]
21+
for bset_key in keys(res["results"])
22+
bset = res["results"][bset_key]
23+
if typeof(bset[first(first(scale_set))]) <: BenchmarkGroup
24+
procs = parse(Int, lstrip(last(split(first(first(bset[first(first(scale_set))])), ':')), ' '))
25+
for nw in nw_set
26+
for i in 1:length(scale_set)
27+
set_times = [minimum(bset[scale][nw[1]]).time/(10^9) for scale in first.(scale_set)]
28+
push!(raw_table, (name=bset_key, time=set_times[i], scale=last.(scale_set)[i], procs=nw[2]))
29+
end
30+
end
31+
else
32+
set_times = [minimum(bset[scale]).time/(10^9) for scale in first.(scale_set)]
33+
procs = 8 # default for OpenBLAS
34+
for i in 1:length(set_times)
35+
push!(raw_table, (name=bset_key, time=set_times[i], scale=last.(scale_set)[i], procs=procs))
36+
end
37+
end
38+
end
39+
table = Table(raw_table)
940

10-
scale_set = sort([key=>parse(Int, lstrip(last(split(key, ':')), ' ')) for key in keys(serial_results)]; by=x->x[2])
11-
serial_times = [minimum(serial_results[scale]).time/(10^9) for scale in first.(scale_set)]
12-
nw_set = sort([key=>parse(Int, lstrip(last(split(key, ':')), ' ')) for key in keys(dagger_results[first(first(scale_set))])]; by=x->x[2])
41+
@show table
42+
btable = copy(table[map(x->!x, occursin.(Ref("dagger"), table.name))])
43+
dtable = copy(table[occursin.(Ref("dagger"), table.name)])
44+
@show btable dtable
1345

14-
table = Table(name=[:Base for _ in 1:3], time=serial_times, scale=last.(scale_set), procs=[8 for _ in 1:3])
46+
#table = Table(name=[:Base for _ in 1:3], time=serial_times, scale=last.(scale_set), procs=[8 for _ in 1:3])
1547

16-
btable = copy(table)
48+
#btable = copy(table)
1749

50+
#=
1851
for (nw,nw_val) in nw_set
1952
dagger_times = [minimum(dagger_results[scale][nw]).time/(10^9) for scale in first.(scale_set)]
2053
t = Table(name=[:Dagger for _ in 1:3], time=dagger_times, scale=last.(scale_set), procs=[parse(Int,split(nw, ":")[2]) for _ in 1:3])
2154
append!(table, t)
2255
end
56+
=#
2357

24-
dtable = table[table.name .== :Dagger]
58+
#dtable = table[table.name .== :Dagger]
2559

2660
# Plotting
2761

@@ -45,11 +79,11 @@ legend_names = String[]
4579

4680
scales = unique(dtable.scale)
4781

48-
colors = distinguishable_colors(lenght(scales), ColorSchemes.seaborn_deep.colors)
82+
colors = distinguishable_colors(length(scales), ColorSchemes.seaborn_deep.colors)
4983

5084
for (i, scale) in enumerate(scales)
5185
stable = dtable[dtable.scale .== scale]
52-
t1 = first(stable[stable.procs .== 1].time)
86+
t1 = first(stable[stable.procs .== minimum(dtable.procs)].time)
5387
ss_efficiency = strong_scaling.(t1, stable.time, stable.procs)
5488
push!(line_plots, lines!(ssp, stable.procs, ss_efficiency, linewidth=3.0, color = colors[i]))
5589
push!(legend_names, "scale = $scale")
@@ -65,25 +99,33 @@ save("strong_scaling.png", fig)
6599
# too little data
66100

67101
fig = Figure(resolution = (1200, 800))
68-
weak_scaling(t1, tn) = t1/tn
102+
# Weak-scaling efficiency: the baseline time `t1` divided by the measured time
# `tn` after weighting `tn` by the ratio `p_prime / p`.
function weak_scaling(t1, tn, p_prime, p)
    ratio = p_prime / p
    return t1 / (ratio * tn)
end
69103

70-
dtable = table[table.name .== :Dagger]
71-
wstable = filter(row->row.scale == row.procs, dtable)
72-
wstable = sort(wstable, by=r->r.scale)
73-
t1 = first(wstable).time
104+
t1 = first(dtable[map(row->(row.scale == 10) && (row.procs == 1), dtable)]).time
74105

75106
fig = Figure(resolution = (1200, 800))
76-
perf = fig[1, 1] = Axis(fig, title = "Weak scaling")
77-
perf.xlabel = "nprocs"
78-
perf.ylabel = "Efficiency"
107+
perf = fig[1, 1] = Axis(fig, title = "Weak Scaling")
108+
perf.xlabel = "Number of processes"
109+
perf.ylabel = "Scaling efficiency"
110+
111+
line_plots = Any[]
112+
legend_names = String[]
113+
114+
wstable = similar(dtable, 0)
115+
for pair in [(10,1),(35,4),(85,8)]
116+
append!(wstable, dtable[map(row->(row.scale == pair[1]) && (row.procs == pair[2]), rows(dtable))])
117+
end
118+
@show wstable
119+
push!(line_plots, lines!(perf, wstable.procs, weak_scaling.(t1, wstable.time, wstable.procs .* 10, wstable.scale), linewidth=3.0))
120+
push!(legend_names, "cpu+dagger")
79121

80-
lines!(perf, wstable.procs, weak_scaling.(t1, wstable.time), linewidth=3.0)
122+
legend = fig[1, 2] = Legend(fig, line_plots, legend_names)
81123
save("weak_scaling.png", fig)
82124

83125
# 3. Comparison against Base
84126

85127
fig = Figure(resolution = (1200, 800))
86-
perf = fig[1, 1] = Axis(fig, title = "DaggerArrays vs Base")
128+
perf = fig[1, 1] = Axis(fig, title = "Dagger vs Base")
87129
perf.xlabel = "Scaling factor"
88130
perf.ylabel = "time (s)"
89131

@@ -92,7 +134,7 @@ legend_names = String[]
92134

93135
procs = unique(dtable.procs)
94136

95-
colors = distinguishable_colors(lenght(procs) + 1, ColorSchemes.seaborn_deep.colors)
137+
colors = distinguishable_colors(length(procs) + 1, ColorSchemes.seaborn_deep.colors)
96138

97139
for (i, nproc) in enumerate(procs)
98140
stable = dtable[dtable.procs .== nproc]
@@ -109,9 +151,9 @@ save("raw_timings.png", fig)
109151

110152
# 4. Speedup
111153
fig = Figure(resolution = (1200, 800))
112-
speedup = fig[1, 1] = Axis(fig, title = "DaggerArrays vs Base (8 threads)")
154+
speedup = fig[1, 1] = Axis(fig, title = "Dagger vs Base (8 threads)")
113155
speedup.xlabel = "Scaling factor"
114-
speedup.ylabel = "Speedup Base/Dagger"
156+
speedup.ylabel = "Runtime Dagger/Base"
115157

116158
line_plots = Any[]
117159
legend_names = String[]
@@ -121,7 +163,7 @@ colors = distinguishable_colors(length(procs), ColorSchemes.seaborn_deep.colors)
121163
sort!(btable, by=r->r.scale)
122164

123165
for (i, nproc) in enumerate(unique(dtable.procs))
124-
nproc < 8 && continue
166+
nproc == 8 || continue
125167
stable = dtable[dtable.procs .== nproc]
126168
sort!(stable, by=r->r.scale)
127169
push!(line_plots, lines!(speedup, stable.scale, btable.time ./ stable.time, linewidth=3.0, color = colors[i]))

src/lib/logging.jl

+12-5
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,13 @@ function raise_event(ctx, phase, category, id,tl, t, gc_num, prof, async)
116116
end
117117
end
118118

119-
empty_prof() = ProfilerResult(UInt[], Profile.getdict(UInt[]))
119+
# Build a ProfilerResult holding no samples and an empty line-info dictionary.
function empty_prof()
    no_lineinfo = Dict{UInt64, Vector{Base.StackTraces.StackFrame}}()
    return ProfilerResult(UInt[], no_lineinfo)
end
120+
121+
const prof_refcount = Ref{Threads.Atomic{Int}}(Threads.Atomic{Int}(0))
120122

121123
function timespan_start(ctx, category, id, tl, async=isasync(ctx.log_sink))
122124
isa(ctx.log_sink, NoOpLog) && return # don't go till raise
123-
if ctx.profile && category == :compute
125+
if ctx.profile && category == :compute && Threads.atomic_add!(prof_refcount[], 1) == 0
124126
Profile.start_timer()
125127
end
126128
raise_event(ctx, :start, category, id, tl, time_ns(), gc_num(), empty_prof(), async)
@@ -129,13 +131,18 @@ end
129131

130132
function timespan_end(ctx, category, id, tl, async=isasync(ctx.log_sink))
131133
isa(ctx.log_sink, NoOpLog) && return
134+
time = time_ns()
135+
gcn = gc_num()
132136
prof = UInt[]
137+
lidict = Dict{UInt64, Vector{Base.StackTraces.StackFrame}}()
133138
if ctx.profile && category == :compute
134-
Profile.stop_timer()
135-
prof = Profile.fetch()
139+
if Threads.atomic_sub!(prof_refcount[], 1) == 1
140+
Profile.stop_timer()
141+
end
142+
prof, lidict = Profile.retrieve()
136143
Profile.clear()
137144
end
138-
raise_event(ctx, :finish, category, id, tl,time_ns(), gc_num(), ProfilerResult(prof, Profile.getdict(prof)), async)
145+
raise_event(ctx, :finish, category, id, tl, time, gcn, ProfilerResult(prof, lidict), async)
139146
nothing
140147
end
141148

src/processor.jl

+2
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,8 @@ Context(procs::Vector{P}=Processor[OSProc(w) for w in workers()];
264264
profile=false, options=nothing) where {P<:Processor} =
265265
Context(procs, proc_lock, log_sink, log_file, profile, options)
266266
Context(xs::Vector{Int}; kwargs...) = Context(map(OSProc, xs); kwargs...)
267+
Context(ctx::Context, xs::Vector) =
268+
Context(xs; log_sink=ctx.log_sink, log_file=ctx.log_file, profile=ctx.profile)
267269
procs(ctx::Context) = lock(ctx) do
268270
copy(ctx.procs)
269271
end

src/sch/unix.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ struct TimeSpec
1313
tv_nsec :: UInt64
1414
end
1515

16-
maketime(ts) = ts.tv_sec * 1e9 + ts.tv_nsec
16+
# Collapse a value with `tv_sec` / `tv_nsec` fields into a single nanosecond
# count: seconds are scaled by 10^9 (as a UInt) and the nanosecond part is added.
function maketime(ts)
    nanos_per_second = UInt(1_000_000_000)
    return ts.tv_sec * nanos_per_second + ts.tv_nsec
end
1717

1818
# From bits/times.h on a Linux system
1919
# Check if those are the same on BSD

0 commit comments

Comments
 (0)