From 38e41884b27aee8fcc1e43008b4435a56965bf3c Mon Sep 17 00:00:00 2001 From: Timo Kluck Date: Fri, 3 May 2019 17:05:33 +0200 Subject: [PATCH 1/2] WIP: statistical allocation profiling Julia has a mature statistical profiler. It sets a timer that captures a backtrace when it is triggered. By the law of large numbers, this gives insight into where an algorithm spends its time, without noticably slowing the program down. By comparison, finding out where the allocations are happening is quite bit more cumbersome. It needs starting Julia with a specific command line switch, code execution is _much_ slower, and after program exit, the results are scattered over the file system. This pull request represents an attempt at bringing the ergonomics of statistical _runtime_ profiling to allocations: "statistical allocation profiling". Similar to how, in the former case, `Profile.init` configures a delay between backtraces, this branch add an option to specify a fraction of allocations that capture a backtrace. Example usage: ```julia using Profile Profile.init(alloc_rate = 0.01) doublefibonacci(n) = if n <= 2 return [1, 1] else return doublefibonacci(n - 1) .+ doublefibonacci(n - 2) end @profile for i=1:1000; doublefibonacci(15); end Profile.print() # but better to use e.g. ProfileView or StatProfilerHTML ``` State of this commit: - linux support only - not thread-safe - no attempt at a friendly human interface; as it is, the `Profile.init` API almost encourages a linear combination of runtime and allocation profiling. That makes no sense at all. --- base/util.jl | 5 +++ src/gc.c | 28 +++++++++++++++ src/gc.h | 3 ++ src/signals-unix.c | 65 +++++++++++++++++++++-------------- stdlib/Profile/src/Profile.jl | 5 +-- 5 files changed, 79 insertions(+), 27 deletions(-) diff --git a/base/util.jl b/base/util.jl index 11fcc14a39c7c..c072d076df888 100644 --- a/base/util.jl +++ b/base/util.jl @@ -21,6 +21,11 @@ end gc_num() = ccall(:jl_gc_num, GC_Num, ()) +# TODO(tkluck): not sure if it's necessary to expose these; we can alternatively just use +# ccall in the Profile module. +gc_get_statprofile_sample() = ccall(:jl_gc_get_statprofile_sample, Cdouble, ()) +gc_set_statprofile_sample!(n) = ccall(:jl_gc_set_statprofile_sample, Cvoid, (Cdouble,), n) + # This type is to represent differences in the counters, so fields may be negative struct GC_Diff allocd ::Int64 # Bytes allocated diff --git a/src/gc.c b/src/gc.c index 3706acffc5e3f..eec5d452e221b 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1,5 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license +#include // for rand + #include "gc.h" #include "julia_gcext.h" #include "julia_assert.h" @@ -8,6 +10,9 @@ extern "C" { #endif +// declaration rom signals-unix.c +int jl_profile_record_trace(bt_context_t *ctx); + // Linked list of callback functions typedef void (*jl_gc_cb_func_t)(void); @@ -165,6 +170,7 @@ static int support_conservative_marking = 0; jl_gc_num_t gc_num = {0,0,0,0,0,0,0,0,0,0,0,0,0,0}; static size_t last_long_collect_interval; +static int gc_statprofile_sample_rate = 0; pagetable_t memory_map; @@ -882,6 +888,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) gc_num.allocd += allocsz; #endif gc_num.bigalloc++; + if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate) + jl_profile_record_trace(NULL); #ifdef MEMDEBUG memset(v, 0xee, allocsz); #endif @@ -1108,6 +1116,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, jl_gc_safepoint_(ptls); } gc_num.poolalloc++; + if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate) + jl_profile_record_trace(NULL); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; if (v) { @@ -2613,6 +2623,14 @@ JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { return gc_num; } +JL_DLLEXPORT double jl_gc_get_statprofile_sample(void) +{ + return gc_statprofile_sample_rate / (double)(RAND_MAX); +} +JL_DLLEXPORT void jl_gc_set_statprofile_sample(double s) +{ + gc_statprofile_sample_rate = (int)(s * RAND_MAX); +} JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) { @@ -2999,6 +3017,8 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) maybe_collect(ptls); gc_num.allocd += sz; gc_num.malloc++; + if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate) + jl_profile_record_trace(NULL); void *b = malloc(sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -3011,6 +3031,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) maybe_collect(ptls); gc_num.allocd += nm*sz; gc_num.malloc++; + if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate) + jl_profile_record_trace(NULL); void *b = calloc(nm, sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -3039,6 +3061,8 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size else gc_num.allocd += (sz - old); gc_num.realloc++; + if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate) + jl_profile_record_trace(NULL); void *b = realloc(p, sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -3100,6 +3124,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) jl_throw(jl_memory_exception); gc_num.allocd += allocsz; gc_num.malloc++; + if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate) + jl_profile_record_trace(NULL); void *b = malloc_cache_align(allocsz); if (b == NULL) jl_throw(jl_memory_exception); @@ -3125,6 +3151,8 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds else gc_num.allocd += (allocsz - oldsz); gc_num.realloc++; + if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate) + jl_profile_record_trace(NULL); void *b; if (isaligned) diff --git a/src/gc.h b/src/gc.h index 2be22aa782408..bb89a77373f63 100644 --- a/src/gc.h +++ b/src/gc.h @@ -655,6 +655,9 @@ static inline void gc_scrub(void) } #endif +double jl_gc_get_statprofile_sample(void); +void jl_gc_set_statprofile_sample(double); + #ifdef OBJPROFILE void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT; void objprofile_printall(void); diff --git a/src/signals-unix.c b/src/signals-unix.c index 274277d32c845..0ef08d3166b69 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -550,6 +550,43 @@ static void kqueue_signal(int *sigqueue, struct kevent *ev, int sig) } #endif +int jl_profile_record_trace(bt_context_t *ctx) +{ + if (running) { + ucontext_t current_ctx; + if (NULL == ctx) { + getcontext(¤t_ctx); + ctx = jl_to_bt_context(¤t_ctx); + } + if (bt_size_cur < bt_size_max - 1) { + // unwinding can fail, so keep track of the current state + // and restore from the SEGV handler if anything happens. + jl_ptls_t ptls = jl_get_ptls_states(); + jl_jmp_buf *old_buf = ptls->safe_restore; + jl_jmp_buf buf; + + ptls->safe_restore = &buf; + if (jl_setjmp(buf, 0)) { + jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); + } else { + // Get backtrace data + bt_size_cur += rec_backtrace_ctx((uintptr_t*)bt_data_prof + bt_size_cur, + bt_size_max - bt_size_cur - 1, ctx); + } + ptls->safe_restore = old_buf; + + // Mark the end of this block with 0 + bt_data_prof[bt_size_cur++] = 0; + + return 0; + } + if (bt_size_cur >= bt_size_max - 1) { + return 1; + } + } + return 0; +} + static void *signal_listener(void *arg) { static uintptr_t bt_data[JL_MAX_BT_SIZE + 1]; @@ -674,31 +711,9 @@ static void *signal_listener(void *arg) } // do backtrace for profiler - if (profile && running) { - if (bt_size_cur < bt_size_max - 1) { - // unwinding can fail, so keep track of the current state - // and restore from the SEGV handler if anything happens. - jl_ptls_t ptls = jl_get_ptls_states(); - jl_jmp_buf *old_buf = ptls->safe_restore; - jl_jmp_buf buf; - - ptls->safe_restore = &buf; - if (jl_setjmp(buf, 0)) { - jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); - } else { - // Get backtrace data - bt_size_cur += rec_backtrace_ctx((uintptr_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, signal_context); - } - ptls->safe_restore = old_buf; - - // Mark the end of this block with 0 - bt_data_prof[bt_size_cur++] = 0; - } - if (bt_size_cur >= bt_size_max - 1) { - // Buffer full: Delete the timer - jl_profile_stop_timer(); - } + if(profile && signal_context != NULL && jl_profile_record_trace(signal_context)) { + // Buffer full: Delete the timer + jl_profile_stop_timer(); } // notify thread to resume diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index ffcc6ad10b05a..2ba5ed6efd04b 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -42,7 +42,7 @@ line of code; backtraces generally consist of a long list of instruction pointer settings can be obtained by calling this function with no arguments, and each can be set independently using keywords or in the order `(n, delay)`. """ -function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing) +function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing, alloc_rate::Real=0) n_cur = ccall(:jl_profile_maxlen_data, Csize_t, ()) delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9 if n === nothing && delay === nothing @@ -53,11 +53,12 @@ function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} init(nnew, delaynew) end -function init(n::Integer, delay::Real) +function init(n::Integer, delay::Real, alloc_rate::Real=0) status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), n, round(UInt64,10^9*delay)) if status == -1 error("could not allocate space for ", n, " instruction pointers") end + Base.gc_set_statprofile_sample!(alloc_rate) end # init with default values From 8aa4737ca1ee9283a6cf063a0c6f5882e5c8fbb9 Mon Sep 17 00:00:00 2001 From: Timo Kluck Date: Fri, 3 May 2019 21:18:12 +0200 Subject: [PATCH 2/2] Profile.init: pass keyword parameter onwards This was an oversight in the previous commit. --- stdlib/Profile/src/Profile.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 2ba5ed6efd04b..fa1964ed06305 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -50,7 +50,7 @@ function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} end nnew = (n === nothing) ? n_cur : n delaynew = (delay === nothing) ? delay_cur : delay - init(nnew, delaynew) + init(nnew, delaynew, alloc_rate) end function init(n::Integer, delay::Real, alloc_rate::Real=0)