Skip to content

WIP: statistical allocation profiling #31915

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions base/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ end

gc_num() = ccall(:jl_gc_num, GC_Num, ())

# TODO(tkluck): not sure if it's necessary to expose these; we can alternatively just use
# ccall in the Profile module.
# Ask the runtime (`jl_gc_get_statprofile_sample`) for the current allocation
# statprofile sample setting; the C entry point hands it back as a `Cdouble`.
function gc_get_statprofile_sample()
    return ccall(:jl_gc_get_statprofile_sample, Cdouble, ())
end
# Forward `n` to the runtime (`jl_gc_set_statprofile_sample`). The C entry point
# takes a single `Cdouble` and returns nothing, so this returns `nothing` as well.
function gc_set_statprofile_sample!(n)
    return ccall(:jl_gc_set_statprofile_sample, Cvoid, (Cdouble,), n)
end

# This type is to represent differences in the counters, so fields may be negative
struct GC_Diff
allocd ::Int64 # Bytes allocated
Expand Down
28 changes: 28 additions & 0 deletions src/gc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include <stdlib.h> // for rand

#include "gc.h"
#include "julia_gcext.h"
#include "julia_assert.h"
Expand All @@ -8,6 +10,9 @@
extern "C" {
#endif

// declaration from signals-unix.c
int jl_profile_record_trace(bt_context_t *ctx);

// Linked list of callback functions

typedef void (*jl_gc_cb_func_t)(void);
Expand Down Expand Up @@ -165,6 +170,7 @@ static int support_conservative_marking = 0;

jl_gc_num_t gc_num = {0,0,0,0,0,0,0,0,0,0,0,0,0,0};
static size_t last_long_collect_interval;
static int gc_statprofile_sample_rate = 0;

pagetable_t memory_map;

Expand Down Expand Up @@ -882,6 +888,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
gc_num.allocd += allocsz;
#endif
gc_num.bigalloc++;
if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate)
jl_profile_record_trace(NULL);
#ifdef MEMDEBUG
memset(v, 0xee, allocsz);
#endif
Expand Down Expand Up @@ -1108,6 +1116,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
jl_gc_safepoint_(ptls);
}
gc_num.poolalloc++;
if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate)
Copy link
Contributor

@chethega chethega May 4, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following @timholy's comment on adjustable overhead: This implementation calls the RNG on every alloc. Hence, even if the sample rate is close to zero, the overhead does not converge to zero.

An alternative would be something like
if(gc_num.poolalloc++ == gc_num.next_pool_sample) {gc_num.next_pool_sample += gc_statprofile_pool_inverse_rate; jl_profile_record_trace(NULL);}.

With gc_num.next_pool_sample = 0, this would trigger on next wrap-around, i.e. never, and with gc_statprofile_pool_inverse_rate large this would trigger very rarely. We would pay only a single predicted branch on allocs we don't want to sample.

Similar treatment could be applied to gc_num.bigalloc, gc_num.allocd, etc. counters. We probably should randomize the increment in order to avoid biases in loops that have period close to commensurable with the inverse rate. While Poisson distribution of the gaps (as your code provides) is statistically nicer, something like 1 + (inverse_rate * rand_uint16()) >> 15 is probably good enough.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a great point. I'll run some timings to see how RNG overhead compares to the allocation itself. If it's significant, I'll investigate the right scheme to use here. If not, there's probably value in keeping Poisson.

jl_profile_record_trace(NULL);
// first try to use the freelist
jl_taggedvalue_t *v = p->freelist;
if (v) {
Expand Down Expand Up @@ -2613,6 +2623,14 @@ JL_DLLEXPORT jl_gc_num_t jl_gc_num(void)
{
return gc_num;
}
// Report the allocation-sampling setting as a fraction: the stored integer
// threshold divided by RAND_MAX.
JL_DLLEXPORT double jl_gc_get_statprofile_sample(void)
{
    const double scale = (double)(RAND_MAX);
    return gc_statprofile_sample_rate / scale;
}
// Set the allocation-sampling setting from the fraction `s`.
//
// The value is stored as an integer threshold on the RAND_MAX scale.
// `s` is clamped to [0, 1] before scaling:
//   - NaN, or a product that does not fit in `int`, would make the
//     double -> int conversion undefined;
//   - a negative `s` would store a negative threshold, which the getter would
//     then report as a negative rate.
// Values already inside [0, 1] are stored exactly as before.
JL_DLLEXPORT void jl_gc_set_statprofile_sample(double s)
{
    if (!(s > 0.0)) {
        // Covers 0, negatives and NaN (every comparison with NaN is false).
        gc_statprofile_sample_rate = 0;
    }
    else if (s >= 1.0) {
        gc_statprofile_sample_rate = RAND_MAX;
    }
    else {
        // 0 < s < 1, so the product is strictly below RAND_MAX and fits in int.
        gc_statprofile_sample_rate = (int)(s * RAND_MAX);
    }
}

JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void)
{
Expand Down Expand Up @@ -2999,6 +3017,8 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
maybe_collect(ptls);
gc_num.allocd += sz;
gc_num.malloc++;
if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate)
jl_profile_record_trace(NULL);
void *b = malloc(sz);
if (b == NULL)
jl_throw(jl_memory_exception);
Expand All @@ -3011,6 +3031,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
maybe_collect(ptls);
gc_num.allocd += nm*sz;
gc_num.malloc++;
if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate)
jl_profile_record_trace(NULL);
void *b = calloc(nm, sz);
if (b == NULL)
jl_throw(jl_memory_exception);
Expand Down Expand Up @@ -3039,6 +3061,8 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
else
gc_num.allocd += (sz - old);
gc_num.realloc++;
if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate)
jl_profile_record_trace(NULL);
void *b = realloc(p, sz);
if (b == NULL)
jl_throw(jl_memory_exception);
Expand Down Expand Up @@ -3100,6 +3124,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
jl_throw(jl_memory_exception);
gc_num.allocd += allocsz;
gc_num.malloc++;
if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate)
jl_profile_record_trace(NULL);
void *b = malloc_cache_align(allocsz);
if (b == NULL)
jl_throw(jl_memory_exception);
Expand All @@ -3125,6 +3151,8 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
else
gc_num.allocd += (allocsz - oldsz);
gc_num.realloc++;
if(gc_statprofile_sample_rate && rand() < gc_statprofile_sample_rate)
jl_profile_record_trace(NULL);

void *b;
if (isaligned)
Expand Down
3 changes: 3 additions & 0 deletions src/gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,9 @@ static inline void gc_scrub(void)
}
#endif

double jl_gc_get_statprofile_sample(void);
void jl_gc_set_statprofile_sample(double);

#ifdef OBJPROFILE
void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT;
void objprofile_printall(void);
Expand Down
65 changes: 40 additions & 25 deletions src/signals-unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,43 @@ static void kqueue_signal(int *sigqueue, struct kevent *ev, int sig)
}
#endif

// Record one backtrace into the profile buffer (bt_data_prof).
//
// ctx: unwinding context to start from. When NULL, the context of the calling
//      thread is captured with getcontext() and used instead.
//
// Returns 1 when profiling is running and the buffer has no room left for a
// further sample (the caller uses this to stop the profile timer), 0 otherwise.
// Nothing is recorded, and 0 is returned, when profiling is not running.
//
// NOTE(review): bt_size_cur / bt_data_prof are updated here without any locking
// visible in this function; confirm that calls made from allocation paths
// cannot race with the signal listener thread.
int jl_profile_record_trace(bt_context_t *ctx)
{
    // Must outlive the unwinding below, since `ctx` may point into it.
    ucontext_t current_ctx;

    if (!running)
        return 0;

    if (ctx == NULL) {
        getcontext(&current_ctx);
        ctx = jl_to_bt_context(&current_ctx);
    }

    if (bt_size_cur < bt_size_max - 1) {
        // unwinding can fail, so keep track of the current state
        // and restore from the SEGV handler if anything happens.
        jl_ptls_t ptls = jl_get_ptls_states();
        jl_jmp_buf *old_buf = ptls->safe_restore;
        jl_jmp_buf buf;

        ptls->safe_restore = &buf;
        if (jl_setjmp(buf, 0)) {
            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
        }
        else {
            // Get backtrace data; one slot is kept free for the terminator.
            bt_size_cur += rec_backtrace_ctx((uintptr_t*)bt_data_prof + bt_size_cur,
                                             bt_size_max - bt_size_cur - 1, ctx);
        }
        ptls->safe_restore = old_buf;

        // Mark the end of this block with 0
        bt_data_prof[bt_size_cur++] = 0;
    }

    // Check fullness *after* recording, so the sample that fills the buffer is
    // reported right away instead of only on the following call.
    return bt_size_cur >= bt_size_max - 1;
}

static void *signal_listener(void *arg)
{
static uintptr_t bt_data[JL_MAX_BT_SIZE + 1];
Expand Down Expand Up @@ -674,31 +711,9 @@ static void *signal_listener(void *arg)
}

// do backtrace for profiler
if (profile && running) {
if (bt_size_cur < bt_size_max - 1) {
// unwinding can fail, so keep track of the current state
// and restore from the SEGV handler if anything happens.
jl_ptls_t ptls = jl_get_ptls_states();
jl_jmp_buf *old_buf = ptls->safe_restore;
jl_jmp_buf buf;

ptls->safe_restore = &buf;
if (jl_setjmp(buf, 0)) {
jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
} else {
// Get backtrace data
bt_size_cur += rec_backtrace_ctx((uintptr_t*)bt_data_prof + bt_size_cur,
bt_size_max - bt_size_cur - 1, signal_context);
}
ptls->safe_restore = old_buf;

// Mark the end of this block with 0
bt_data_prof[bt_size_cur++] = 0;
}
if (bt_size_cur >= bt_size_max - 1) {
// Buffer full: Delete the timer
jl_profile_stop_timer();
}
if(profile && signal_context != NULL && jl_profile_record_trace(signal_context)) {
// Buffer full: Delete the timer
jl_profile_stop_timer();
}

// notify thread to resume
Expand Down
7 changes: 4 additions & 3 deletions stdlib/Profile/src/Profile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,23 @@ line of code; backtraces generally consist of a long list of instruction pointer
settings can be obtained by calling this function with no arguments, and each can be set
independently using keywords or in the order `(n, delay)`.
"""
function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing,
              alloc_rate::Union{Nothing,Real} = nothing)
    n_cur = ccall(:jl_profile_maxlen_data, Csize_t, ())
    delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9
    # Pure query: no setting was supplied, so report the current buffer size and
    # delay. `alloc_rate` takes part in this test; otherwise a call such as
    # `init(alloc_rate = 0.01)` would be treated as a query and silently ignored.
    if n === nothing && delay === nothing && alloc_rate === nothing
        return Int(n_cur), delay_cur
    end
    # Every setting that was not supplied keeps its current value, so each one can
    # be changed independently of the others.
    nnew = (n === nothing) ? n_cur : n
    delaynew = (delay === nothing) ? delay_cur : delay
    ratenew = (alloc_rate === nothing) ? Base.gc_get_statprofile_sample() : alloc_rate
    init(nnew, delaynew, ratenew)
end

function init(n::Integer, delay::Real, alloc_rate::Real=0)
    # `alloc_rate` is a sampling fraction: the runtime scales it by RAND_MAX and
    # casts the result to a C `int`, so values outside [0, 1] (or NaN) are not
    # meaningful there. Reject them up front, before any profiler state changes.
    0 <= alloc_rate <= 1 ||
        throw(ArgumentError("alloc_rate must be between 0 and 1, got $alloc_rate"))
    # `delay` is given in seconds; the runtime takes whole nanoseconds.
    status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), n, round(UInt64,10^9*delay))
    if status == -1
        error("could not allocate space for ", n, " instruction pointers")
    end
    Base.gc_set_statprofile_sample!(alloc_rate)
end

# init with default values
Expand Down