Skip to content

Commit e72d835

Browse files
authored
Merge pull request ocaml#12923 from NickBarnes/nick-11911-statmemprof-rebase
Statmemprof resurrected
2 parents 18c4510 + 62d04fa commit e72d835

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+3315
-1374
lines changed

Changes

+11-5
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ _______________
2929
(Kate Deplaix and Oscar Butler-Aldridge review by Nicolás Ojeda Bär,
3030
Craig Ferguson and Gabriel Scherer)
3131

32+
- #11911, #12923: Multicore statistical memory profiling (Nick Barnes,
33+
review by Stephen Dolan, Gabriel Scherer, Jacques-Henri Jourdan,
34+
Guillaume Munch-Maccagnoni).
35+
3236
### Other libraries:
3337

3438
### Tools:
@@ -282,8 +286,9 @@ OCaml 5.2.0
282286
review by Damien Doligez, Sébastien Hinderer, Jacques-Henri Jourdan, Luc
283287
Maranget, Guillaume Munch-Maccagnoni, Gabriel Scherer)
284288

285-
- #11911, #12381: Restore statmemprof functionality in part
286-
(API changes in Gc.Memprof). (Nick Barnes)
289+
- #11911, #12381: Restore statmemprof functionality in part (API
290+
changes in Gc.Memprof). (Nick Barnes, review by Jacques-Henri
291+
Jourdan and Guillaume Munch-Maccagnoni).
287292

288293
- #12430: Simplify dynamic bytecode loading in Meta.reify_bytecode
289294
(Stephen Dolan, review by Sébastien Hinderer, Vincent Laviron and Xavier
@@ -342,9 +347,10 @@ OCaml 5.2.0
342347
Ojeda Bar)
343348

344349
- #11911, #12382, #12383: Restore statmemprof functionality in part
345-
(backtrace buffers, per-thread and per-domain data structures).
346-
(Nick Barnes, review by Gabriel Scherer, Fabrice Buoro, Sadiq
347-
Jaffer, and Guillaume Munch-Maccagnoni).
350+
(backtrace buffers, per-thread and per-domain data structures,
351+
GC/allocation interface). (Nick Barnes, review by Gabriel Scherer,
352+
Fabrice Buoro, Sadiq Jaffer, Guillaume Munch-Maccagnoni, and
353+
Jacques-Henri Jourdan).
348354

349355
- #12735: Store both ends of the stack chain in continuations
350356
(Leo White, review by Miod Vallat and KC Sivaramakrishnan)

otherlibs/runtime_events/runtime_events.ml

+8
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,12 @@ type runtime_phase =
4444
| EV_MAJOR
4545
| EV_MAJOR_SWEEP
4646
| EV_MAJOR_MARK_ROOTS
47+
| EV_MAJOR_MEMPROF_ROOTS
4748
| EV_MAJOR_MARK
4849
| EV_MINOR
4950
| EV_MINOR_LOCAL_ROOTS
51+
| EV_MINOR_MEMPROF_ROOTS
52+
| EV_MINOR_MEMPROF_CLEAN
5053
| EV_MINOR_FINALIZED
5154
| EV_EXPLICIT_GC_MAJOR_SLICE
5255
| EV_FINALISE_UPDATE_FIRST
@@ -69,6 +72,7 @@ type runtime_phase =
6972
| EV_STW_HANDLER
7073
| EV_STW_LEADER
7174
| EV_MAJOR_FINISH_SWEEPING
75+
| EV_MAJOR_MEMPROF_CLEAN
7276
| EV_MINOR_FINALIZERS_ADMIN
7377
| EV_MINOR_REMEMBERED_SET
7478
| EV_MINOR_REMEMBERED_SET_PROMOTE
@@ -129,9 +133,12 @@ let runtime_phase_name phase =
129133
| EV_MAJOR -> "major"
130134
| EV_MAJOR_SWEEP -> "major_sweep"
131135
| EV_MAJOR_MARK_ROOTS -> "major_mark_roots"
136+
| EV_MAJOR_MEMPROF_ROOTS -> "major_memprof_roots"
132137
| EV_MAJOR_MARK -> "major_mark"
133138
| EV_MINOR -> "minor"
134139
| EV_MINOR_LOCAL_ROOTS -> "minor_local_roots"
140+
| EV_MINOR_MEMPROF_ROOTS -> "minor_memprof_roots"
141+
| EV_MINOR_MEMPROF_CLEAN -> "minor_memprof_clean"
135142
| EV_MINOR_FINALIZED -> "minor_finalized"
136143
| EV_EXPLICIT_GC_MAJOR_SLICE -> "explicit_gc_major_slice"
137144
| EV_FINALISE_UPDATE_FIRST -> "finalise_update_first"
@@ -153,6 +160,7 @@ let runtime_phase_name phase =
153160
| EV_STW_HANDLER -> "stw_handler"
154161
| EV_STW_LEADER -> "stw_leader"
155162
| EV_MAJOR_FINISH_SWEEPING -> "major_finish_sweeping"
163+
| EV_MAJOR_MEMPROF_CLEAN -> "major_memprof_clean"
156164
| EV_MINOR_FINALIZERS_ADMIN -> "minor_finalizers_admin"
157165
| EV_MINOR_REMEMBERED_SET -> "minor_remembered_set"
158166
| EV_MINOR_REMEMBERED_SET_PROMOTE -> "minor_remembered_set_promote"

otherlibs/runtime_events/runtime_events.mli

+4
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,12 @@ type runtime_phase =
9999
| EV_MAJOR
100100
| EV_MAJOR_SWEEP
101101
| EV_MAJOR_MARK_ROOTS
102+
| EV_MAJOR_MEMPROF_ROOTS
102103
| EV_MAJOR_MARK
103104
| EV_MINOR
104105
| EV_MINOR_LOCAL_ROOTS
106+
| EV_MINOR_MEMPROF_ROOTS
107+
| EV_MINOR_MEMPROF_CLEAN
105108
| EV_MINOR_FINALIZED
106109
| EV_EXPLICIT_GC_MAJOR_SLICE
107110
| EV_FINALISE_UPDATE_FIRST
@@ -124,6 +127,7 @@ type runtime_phase =
124127
| EV_STW_HANDLER
125128
| EV_STW_LEADER
126129
| EV_MAJOR_FINISH_SWEEPING
130+
| EV_MAJOR_MEMPROF_CLEAN
127131
| EV_MINOR_FINALIZERS_ADMIN
128132
| EV_MINOR_REMEMBERED_SET
129133
| EV_MINOR_REMEMBERED_SET_PROMOTE

runtime/caml/memory.h

+3
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,9 @@ enum caml_alloc_small_flags {
200200
#define Alloc_small_enter_GC(dom_st, wosize) \
201201
Alloc_small_enter_GC_flags(CAML_DO_TRACK | CAML_FROM_C, dom_st, wosize)
202202

203+
#define Alloc_small_enter_GC_no_track(dom_st, wosize) \
204+
Alloc_small_enter_GC_flags(CAML_DONT_TRACK | CAML_FROM_C, dom_st, wosize)
205+
203206
#define Alloc_small_with_reserved(result, wosize, tag, GC, reserved) do{ \
204207
CAMLassert ((wosize) >= 1); \
205208
CAMLassert ((tag_t) (tag) < 256); \

runtime/caml/memprof.h

+112-6
Original file line numberDiff line numberDiff line change
@@ -22,25 +22,131 @@
2222
#include "mlvalues.h"
2323
#include "roots.h"
2424

25-
/* Suspend or unsuspend profiling */
25+
/*** Sample allocations ***/
26+
27+
/* [Gc.Memprof.allocation_source] */
28+
29+
enum { CAML_MEMPROF_SRC_NORMAL = 0,
30+
CAML_MEMPROF_SRC_MARSHAL = 1, /* interning */
31+
CAML_MEMPROF_SRC_CUSTOM = 2 /* custom memory */ };
32+
33+
/* Respond to the allocation of any block. Does not call callbacks.
34+
* `block` is the allocated block, to be tracked by memprof if
35+
* sampled. `allocated_words` is the number of words allocated, to be
36+
* passed to the allocation callback. `sampled_words` is the number of
37+
* words to use when computing the number of samples (this will
38+
* normally be one more than `allocated words` due to the header word,
39+
* but may not be for out-of-heap memory). `source` is one of the
40+
* `CAML_MEMPROF_SRC_* constants above. */
41+
42+
void caml_memprof_sample_block(value block, size_t allocated_words,
43+
size_t sampled_words, int source);
44+
45+
/* Sample a minor heap "Comballoc" (combined allocation). Called when
46+
* the memprof trigger is hit (before the allocation is actually
47+
* performed, which may require a GC). `allocs` and `alloc_lens`
48+
* describe the combined allocation. Runs allocation callbacks. */
49+
50+
extern void caml_memprof_sample_young(uintnat wosize, int from_caml,
51+
int allocs, unsigned char* alloc_lens);
52+
53+
/* Suspend or unsuspend sampling (for the current thread). */
54+
2655
extern void caml_memprof_update_suspended(_Bool);
2756

28-
/* Freshly set sampling point on minor heap */
29-
extern void caml_memprof_renew_minor_sample(caml_domain_state *state);
3057

31-
/* Multi-domain support. */
58+
/*** GC interface ***/
59+
60+
/* Apply `f(fdata, r, &r)` to each GC root `r` within memprof data
61+
* structures for the domain `state`.
62+
*
63+
* `fflags` is used to decide whether to only scan roots which may
64+
* point to minor heaps (the `SCANNING_ONLY_YOUNG_VALUES` flag).
65+
*
66+
* If `weak` is false then only scan strong roots. If `weak`
67+
* is true then also scan weak roots.
68+
*
69+
* If `global` is false then only scan roots for `state`. If `global`
70+
* is true then also scan roots shared between all domains. */
71+
72+
extern void caml_memprof_scan_roots(scanning_action f,
73+
scanning_action_flags fflags,
74+
void* fdata,
75+
caml_domain_state *state,
76+
_Bool weak,
77+
_Bool global);
78+
79+
/* Update memprof data structures for the domain `state`, to reflect
80+
* survival and promotion, after a minor GC is completed.
81+
*
82+
* If `global` is false then only update structures for `state`. If
83+
* `global` is true then also update structures shared between all
84+
* domains. */
85+
86+
extern void caml_memprof_after_minor_gc(caml_domain_state *state, _Bool global);
87+
88+
/* Update memprof data structures for the domain `state`, to reflect
89+
* survival, after a minor GC is completed.
90+
*
91+
* If `global` is false then only update structures for `state`. If
92+
* `global` is true then also update structures shared between all
93+
* domains. */
94+
95+
extern void caml_memprof_after_major_gc(caml_domain_state *state, _Bool global);
96+
97+
/* Freshly computes state->memprof_young_trigger. *Does not* set the
98+
* young limit. */
99+
100+
extern void caml_memprof_set_trigger(caml_domain_state *state);
101+
102+
/*** Callbacks ***/
103+
104+
/* Run any pending callbacks for the current domain (or adopted from a
105+
* terminated domain). */
106+
107+
extern value caml_memprof_run_callbacks_exn(void);
108+
109+
110+
/*** Multi-domain support. ***/
111+
112+
/* Notify memprof of the creation of a new domain `domain`. If there
113+
* was an existing domain (from which to inherit profiling behaviour),
114+
* it is passed in `parent`. Called before the new domain allocates
115+
* anything, and before the parent domain continues. Also creates
116+
* memprof thread state for the initial thread of the domain. */
32117

33118
extern void caml_memprof_new_domain(caml_domain_state *parent,
34119
caml_domain_state *domain);
120+
121+
/* Notify memprof that the domain `domain` is terminating. Called
122+
* after the last allocation by the domain. */
123+
35124
extern void caml_memprof_delete_domain(caml_domain_state *domain);
36125

37-
/* Multi-thread support */
126+
127+
/*** Multi-thread support ***/
128+
129+
/* Opaque type of memprof state for a single thread. */
38130

39131
typedef struct memprof_thread_s *memprof_thread_t;
40132

41-
CAMLextern memprof_thread_t caml_memprof_main_thread(caml_domain_state *domain);
133+
/* Notify memprof that a new thread is being created. Returns a
134+
* pointer to memprof state for the new thread. */
135+
42136
CAMLextern memprof_thread_t caml_memprof_new_thread(caml_domain_state *domain);
137+
138+
/* Obtain the memprof state for the initial thread of a domain. Called
139+
* when there is only one such thread. */
140+
141+
CAMLextern memprof_thread_t caml_memprof_main_thread(caml_domain_state *domain);
142+
143+
/* Notify memprof that the current domain is switching to the given
144+
* thread. */
145+
43146
CAMLextern void caml_memprof_enter_thread(memprof_thread_t);
147+
148+
/* Notify memprof that the given thread is being deleted. */
149+
44150
CAMLextern void caml_memprof_delete_thread(memprof_thread_t);
45151

46152
#endif

runtime/caml/runtime_events.h

+4
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,12 @@ typedef enum {
8080
EV_MAJOR,
8181
EV_MAJOR_SWEEP,
8282
EV_MAJOR_MARK_ROOTS,
83+
EV_MAJOR_MEMPROF_ROOTS,
8384
EV_MAJOR_MARK,
8485
EV_MINOR,
8586
EV_MINOR_LOCAL_ROOTS,
87+
EV_MINOR_MEMPROF_ROOTS,
88+
EV_MINOR_MEMPROF_CLEAN,
8689
EV_MINOR_FINALIZED,
8790
EV_EXPLICIT_GC_MAJOR_SLICE,
8891
EV_FINALISE_UPDATE_FIRST,
@@ -105,6 +108,7 @@ typedef enum {
105108
EV_STW_HANDLER,
106109
EV_STW_LEADER,
107110
EV_MAJOR_FINISH_SWEEPING,
111+
EV_MAJOR_MEMPROF_CLEAN,
108112
EV_MINOR_FINALIZERS_ADMIN,
109113
EV_MINOR_REMEMBERED_SET,
110114
EV_MINOR_REMEMBERED_SET_PROMOTE,

runtime/custom.c

+4-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,10 @@ CAMLexport value caml_alloc_custom_mem(const struct custom_operations * ops,
112112
uintnat bsz,
113113
mlsize_t mem)
114114
{
115-
return alloc_custom_gen (ops, bsz, mem, 0, get_max_minor());
115+
value v = alloc_custom_gen (ops, bsz, mem, 0, get_max_minor());
116+
size_t mem_words = (mem + sizeof(value) - 1) / sizeof(value);
117+
caml_memprof_sample_block(v, mem_words, mem_words, CAML_MEMPROF_SRC_CUSTOM);
118+
return v;
116119
}
117120

118121
struct custom_operations_list {

runtime/domain.c

+5-3
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,7 @@ static void free_minor_heap(void) {
469469
domain_state->young_end = NULL;
470470
domain_state->young_ptr = NULL;
471471
domain_state->young_trigger = NULL;
472+
domain_state->memprof_young_trigger = NULL;
472473
atomic_store_release(&domain_state->young_limit,
473474
(uintnat) domain_state->young_start);
474475
}
@@ -511,7 +512,7 @@ static int allocate_minor_heap(asize_t wsize) {
511512
* major slice is scheduled. */
512513
domain_state->young_trigger = domain_state->young_start
513514
+ (domain_state->young_end - domain_state->young_start) / 2;
514-
caml_memprof_renew_minor_sample(domain_state);
515+
caml_memprof_set_trigger(domain_state);
515516
caml_reset_young_limit(domain_state);
516517

517518
check_minor_heap();
@@ -620,14 +621,16 @@ static void domain_create(uintnat initial_minor_heap_wsize,
620621
atomic_store_explicit(&s->interrupt_word, &domain_state->young_limit,
621622
memory_order_release);
622623

624+
domain_state->id = d->id;
625+
623626
/* Tell memprof system about the new domain before either (a) new
624627
* domain can allocate anything or (b) parent domain can go away. */
628+
CAMLassert(domain_state->memprof == NULL);
625629
caml_memprof_new_domain(parent, domain_state);
626630
if (!domain_state->memprof) {
627631
goto init_memprof_failure;
628632
}
629633

630-
domain_state->id = d->id;
631634
CAMLassert(!s->interrupt_pending);
632635

633636
domain_state->extra_heap_resources = 0.0;
@@ -728,7 +731,6 @@ static void domain_create(uintnat initial_minor_heap_wsize,
728731
domain_state->trap_barrier_block = -1;
729732
#endif
730733

731-
caml_reset_young_limit(domain_state);
732734
add_next_to_stw_domains();
733735
goto domain_init_complete;
734736

runtime/intern.c

+8-1
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,9 @@ static void intern_alloc_storage(struct caml_intern_state* s, mlsize_t whsize,
396396
wosize = Wosize_whsize(whsize);
397397

398398
if (wosize <= Max_young_wosize && wosize != 0) {
399-
v = caml_alloc_small (wosize, String_tag);
399+
/* don't track bulk allocation in minor heap with statmemprof;
400+
* individual block allocations are tracked instead */
401+
Alloc_small(v, wosize, String_tag, Alloc_small_enter_GC_no_track);
400402
s->intern_dest = (header_t *) Hp_val(v);
401403
} else {
402404
CAMLassert (s->intern_dest == NULL);
@@ -426,6 +428,8 @@ static value intern_alloc_obj(struct caml_intern_state* s, caml_domain_state* d,
426428
(value*)s->intern_dest < d->young_end);
427429
p = s->intern_dest;
428430
*s->intern_dest = Make_header (wosize, tag, 0);
431+
caml_memprof_sample_block(Val_hp(p), wosize, 1 + wosize,
432+
CAML_MEMPROF_SRC_MARSHAL);
429433
s->intern_dest += 1 + wosize;
430434
} else {
431435
p = caml_shared_try_alloc(d->shared_heap, wosize, tag,
@@ -436,6 +440,9 @@ static value intern_alloc_obj(struct caml_intern_state* s, caml_domain_state* d,
436440
caml_raise_out_of_memory();
437441
}
438442
Hd_hp(p) = Make_header (wosize, tag, caml_global_heap_state.MARKED);
443+
caml_memprof_sample_block(Val_hp(p), wosize,
444+
Whsize_wosize(wosize),
445+
CAML_MEMPROF_SRC_MARSHAL);
439446
}
440447
return Val_hp(p);
441448
}

runtime/major_gc.c

+16
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "caml/globroots.h"
3232
#include "caml/gc_stats.h"
3333
#include "caml/memory.h"
34+
#include "caml/memprof.h"
3435
#include "caml/mlvalues.h"
3536
#include "caml/platform.h"
3637
#include "caml/roots.h"
@@ -1359,6 +1360,16 @@ static void stw_cycle_all_domains(caml_domain_state* domain, void* args,
13591360
because there's one in the minor gc and after. */
13601361
struct cycle_callback_params params = *((struct cycle_callback_params*)args);
13611362

1363+
/* TODO: Not clear this memprof work is really part of the "cycle"
1364+
* operation. It's more like ephemeron-cleaning really. An earlier
1365+
* version had a separate callback for this, but resulted in
1366+
* failures because using caml_try_run_on_all_domains() on it would
1367+
* mysteriously put all domains back into mark/sweep.
1368+
*/
1369+
CAML_EV_BEGIN(EV_MAJOR_MEMPROF_CLEAN);
1370+
caml_memprof_after_major_gc(domain, domain == participating[0]);
1371+
CAML_EV_END(EV_MAJOR_MEMPROF_CLEAN);
1372+
13621373
CAML_EV_BEGIN(EV_MAJOR_GC_CYCLE_DOMAINS);
13631374

13641375
CAMLassert(domain == Caml_state);
@@ -1524,6 +1535,11 @@ static void stw_cycle_all_domains(caml_domain_state* domain, void* args,
15241535
}
15251536
CAML_EV_END(EV_MAJOR_MARK_ROOTS);
15261537

1538+
CAML_EV_BEGIN(EV_MAJOR_MEMPROF_ROOTS);
1539+
caml_memprof_scan_roots(caml_darken, darken_scanning_flags, domain,
1540+
domain, false, participating[0] == Caml_state);
1541+
CAML_EV_END(EV_MAJOR_MEMPROF_ROOTS);
1542+
15271543
if (domain->mark_stack->count == 0 &&
15281544
!caml_addrmap_iter_ok(&domain->mark_stack->compressed_stack,
15291545
domain->mark_stack->compressed_stack_iter)

0 commit comments

Comments
 (0)