Skip to content

Commit f5e5a73

Browse files
gbaraldid-netto
authored andcommitted
Add fallback if we have make a weird GC decision. (#50682)
If something odd happens during GC (the PC goes to sleep) or a very big transient the heuristics might make a bad decision. What this PR implements is if we try to make our target more than double the one we had before we fallback to a more conservative method. This fixes the new issue @vtjnash found in #40644 for me.
1 parent a9f4c08 commit f5e5a73

File tree

5 files changed

+82
-22
lines changed

5 files changed

+82
-22
lines changed

Diff for: Make.inc

+6
Original file line numberDiff line numberDiff line change
@@ -1461,6 +1461,12 @@ endef
14611461
# Overridable in Make.user
14621462
WINE ?= wine
14631463

1464+
ifeq ($(BINARY),32)
1465+
HEAPLIM := --heap-size-hint=1000M
1466+
else
1467+
HEAPLIM :=
1468+
endif
1469+
14641470
# many of the following targets must be = not := because the expansion of the makefile functions (and $1) shouldn't happen until later
14651471
ifeq ($(BUILD_OS), WINNT) # MSYS
14661472
spawn = $(1)

Diff for: src/gc-debug.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -1410,17 +1410,17 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect
14101410
if (!gc_logging_enabled) {
14111411
return;
14121412
}
1413-
jl_safe_printf("GC: pause %.2fms. collected %fMB. %s %s\n",
1413+
jl_safe_printf("\nGC: pause %.2fms. collected %fMB. %s %s\n",
14141414
pause/1e6, freed/(double)(1<<20),
14151415
full ? "full" : "incr",
14161416
recollect ? "recollect" : ""
14171417
);
14181418

1419-
jl_safe_printf("Heap stats: bytes_mapped %.2f MB, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n, Fragmentation %.3f",
1419+
jl_safe_printf("Heap stats: bytes_mapped %.2f MB,\nheap_size %.2f MB, heap_target %.2f MB, Fragmentation %.3f\n",
14201420
jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20),
1421+
// live_bytes/(double)(1<<20), live byes tracking is not accurate.
14211422
jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20),
14221423
jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20),
1423-
live_bytes/(double)(1<<20),
14241424
(double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size)
14251425
);
14261426
}

Diff for: src/gc.c

+64-17
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// This file is a part of Julia. License is MIT: https://julialang.org/license
22

33
#include "gc.h"
4+
#include "julia.h"
45
#include "julia_gcext.h"
56
#include "julia_assert.h"
67
#ifdef __GLIBC__
@@ -804,8 +805,8 @@ static uint64_t old_heap_size = 0;
804805
static uint64_t old_alloc_diff = 0;
805806
static uint64_t old_freed_diff = 0;
806807
static uint64_t gc_end_time = 0;
807-
808-
808+
static int thrash_counter = 0;
809+
static int thrashing = 0;
809810
// global variables for GC stats
810811

811812
// Resetting the object to a young object, this is used when marking the
@@ -1337,7 +1338,10 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
13371338
dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
13381339
uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
13391340
uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
1341+
dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
13401342
jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc - free_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size));
1343+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
1344+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
13411345
}
13421346
}
13431347
}
@@ -3533,9 +3537,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
35333537
// If the live data outgrows the suggested max_total_memory
35343538
// we keep going with minimum intervals and full gcs until
35353539
// we either free some space or get an OOM error.
3536-
if (live_bytes > max_total_memory) {
3537-
sweep_full = 1;
3538-
}
35393540
if (gc_sweep_always_full) {
35403541
sweep_full = 1;
35413542
}
@@ -3569,8 +3570,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
35693570
uint64_t sweep_time = gc_end_time - start_sweep_time;
35703571
gc_num.total_sweep_time += sweep_time;
35713572
gc_num.sweep_time = sweep_time;
3572-
3573-
int thrashing = 0; // maybe we should report this to the user or error out?
3573+
35743574
size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size);
35753575
double target_allocs = 0.0;
35763576
double min_interval = default_collect_interval;
@@ -3581,24 +3581,32 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
35813581
double collect_smooth_factor = 0.5;
35823582
double tuning_factor = 0.03;
35833583
double alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor);
3584-
double alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor);
3584+
double alloc_time = jl_gc_smooth(old_mut_time, mutator_time + sweep_time, alloc_smooth_factor); // Charge sweeping to the mutator
35853585
double gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collect_smooth_factor);
3586-
double gc_time = jl_gc_smooth(old_pause_time, pause, collect_smooth_factor);
3586+
double gc_time = jl_gc_smooth(old_pause_time, pause - sweep_time, collect_smooth_factor);
35873587
old_alloc_diff = alloc_diff;
35883588
old_mut_time = mutator_time;
35893589
old_freed_diff = freed_diff;
35903590
old_pause_time = pause;
3591-
old_heap_size = heap_size;
3592-
thrashing = gc_time > mutator_time * 98 ? 1 : 0;
3591+
old_heap_size = heap_size; // TODO: Update these values dynamically instead of just during the GC
3592+
if (gc_time > alloc_time * 95 && !(thrash_counter < 4))
3593+
thrash_counter += 1;
3594+
else if (thrash_counter > 0)
3595+
thrash_counter -= 1;
35933596
if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0 ) {
35943597
double alloc_rate = alloc_mem/alloc_time;
35953598
double gc_rate = gc_mem/gc_time;
35963599
target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval
35973600
}
35983601
}
3599-
if (target_allocs == 0.0 || thrashing) // If we are thrashing go back to default
3600-
target_allocs = 2*sqrt((double)heap_size/min_interval);
3602+
if (thrashing == 0 && thrash_counter >= 3)
3603+
thrashing = 1;
3604+
else if (thrashing == 1 && thrash_counter <= 2)
3605+
thrashing = 0; // maybe we should report this to the user or error out?
36013606

3607+
int bad_result = (target_allocs*min_interval + heap_size) > 2 * jl_atomic_load_relaxed(&gc_heap_stats.heap_target); // Don't follow through on a bad decision
3608+
if (target_allocs == 0.0 || thrashing || bad_result) // If we are thrashing go back to default
3609+
target_allocs = 2*sqrt((double)heap_size/min_interval);
36023610
uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size;
36033611
if (target_heap > max_total_memory && !thrashing) // Allow it to go over if we are thrashing if we die we die
36043612
target_heap = max_total_memory;
@@ -3852,10 +3860,10 @@ void jl_gc_init(void)
38523860
total_mem = uv_get_total_memory();
38533861
uint64_t constrained_mem = uv_get_constrained_memory();
38543862
if (constrained_mem > 0 && constrained_mem < total_mem)
3855-
total_mem = constrained_mem;
3863+
jl_gc_set_max_memory(constrained_mem - 250*1024*1024); // LLVM + other libraries need some amount of memory
38563864
#endif
38573865
if (jl_options.heap_size_hint)
3858-
jl_gc_set_max_memory(jl_options.heap_size_hint);
3866+
jl_gc_set_max_memory(jl_options.heap_size_hint - 250*1024*1024);
38593867

38603868
jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL};
38613869
gc_mark_loop(NULL, sp);
@@ -3959,7 +3967,26 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
39593967
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
39603968
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
39613969
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
3962-
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old);
3970+
3971+
int64_t diff = sz - old;
3972+
if (diff < 0) {
3973+
uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
3974+
if (free_acc + diff < 16*1024)
3975+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, free_acc + (-diff));
3976+
else {
3977+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_acc + (-diff)));
3978+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
3979+
}
3980+
}
3981+
else {
3982+
uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
3983+
if (alloc_acc + diff < 16*1024)
3984+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + diff);
3985+
else {
3986+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + diff);
3987+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
3988+
}
3989+
}
39633990
}
39643991
return realloc(p, sz);
39653992
}
@@ -4076,7 +4103,27 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
40764103
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
40774104
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
40784105
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
4079-
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz);
4106+
4107+
int64_t diff = allocsz - oldsz;
4108+
if (diff < 0) {
4109+
uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
4110+
if (free_acc + diff < 16*1024)
4111+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, free_acc + (-diff));
4112+
else {
4113+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_acc + (-diff)));
4114+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
4115+
}
4116+
}
4117+
else {
4118+
uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
4119+
if (alloc_acc + diff < 16*1024)
4120+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + diff);
4121+
else {
4122+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + diff);
4123+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
4124+
}
4125+
}
4126+
40804127
int last_errno = errno;
40814128
#ifdef _OS_WINDOWS_
40824129
DWORD last_error = GetLastError();

Diff for: test/cmdlineargs.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -858,6 +858,6 @@ end
858858
@test lines[3] == "foo"
859859
@test lines[4] == "bar"
860860
end
861-
#heap-size-hint
862-
@test readchomp(`$(Base.julia_cmd()) --startup-file=no --heap-size-hint=500M -e "println(@ccall jl_gc_get_max_memory()::UInt64)"`) == "524288000"
861+
#heap-size-hint, we reserve 250 MB for non GC memory (llvm, etc.)
862+
@test readchomp(`$(Base.julia_cmd()) --startup-file=no --heap-size-hint=500M -e "println(@ccall jl_gc_get_max_memory()::UInt64)"`) == "$((500-250)*1024*1024)"
863863
end

Diff for: test/testenv.jl

+7
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@ if !@isdefined(testenv_defined)
3737

3838
function addprocs_with_testenv(X; rr_allowed=true, kwargs...)
3939
exename = rr_allowed ? `$rr_exename $test_exename` : test_exename
40+
<<<<<<< HEAD
41+
=======
42+
if X isa Integer
43+
heap_size=round(Int,(Sys.free_memory()/(1024^2)/(X+1)))
44+
push!(test_exeflags.exec, "--heap-size-hint=$(heap_size)M")
45+
end
46+
>>>>>>> ab94fadc9f (Add fallback if we have make a weird GC decision. (#50682))
4047
addprocs(X; exename=exename, exeflags=test_exeflags, kwargs...)
4148
end
4249

0 commit comments

Comments
 (0)