Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Supporting sticky (generational) immix #57327

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ ifeq ($(JULIACODEGEN),LLVM)
GC_CODEGEN_SRCS := llvm-final-gc-lowering llvm-late-gc-lowering llvm-gc-invariant-verifier
ifeq (${USE_THIRD_PARTY_GC},mmtk)
FLAGS += -I$(MMTK_API_INC)
GC_CODEGEN_SRCS += llvm-late-gc-lowering-mmtk
GC_CODEGEN_SRCS += llvm-final-gc-lowering-mmtk
else
GC_CODEGEN_SRCS += llvm-late-gc-lowering-stock
GC_CODEGEN_SRCS += llvm-final-gc-lowering-stock
endif
CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \
llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces null_sysimage \
Expand Down
6 changes: 4 additions & 2 deletions src/gc-interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,10 @@ struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty, unsigned align) JL_NOTSAF
// The GC may use that information to, for instance, determine that such objects should
// be treated as marked and belonging to the old generation in nursery collections.
void jl_gc_notify_image_load(const char* img_data, size_t len);
// This function notifies the GC about memory addresses that are set when allocating the boot image.
// The GC may use that information to, for instance, determine that all objects in that chunk of memory should
// be treated as marked and belonging to the old generation in nursery collections.
void jl_gc_notify_image_alloc(const char* img_data, size_t len);

// ========================================================================= //
// Runtime Write-Barriers
Expand Down Expand Up @@ -252,13 +256,11 @@ STATIC_INLINE void jl_gc_wb_knownold(const void *parent JL_UNUSED, const void *p
// per field of the object being copied, but may be special-cased for performance reasons.
STATIC_INLINE void jl_gc_multi_wb(const void *parent,
const struct _jl_value_t *ptr) JL_NOTSAFEPOINT;

// Write-barrier function that must be used after copying fields of elements of genericmemory objects
// into another. It should be semantically equivalent to triggering multiple write barriers – one
// per field of the object being copied, but may be special-cased for performance reasons.
STATIC_INLINE void jl_gc_wb_genericmemory_copy_ptr(const struct _jl_value_t *owner, struct _jl_genericmemory_t *src, char* src_p,
size_t n, struct _jl_datatype_t *dt) JL_NOTSAFEPOINT;

// Similar to jl_gc_wb_genericmemory_copy but must be used when copying *boxed* elements of a genericmemory
// object. Note that this barrier also performs the copying unlike jl_gc_wb_genericmemory_copy_ptr.
// The parameters src_p, dest_p and n will be modified and will contain information about
Expand Down
29 changes: 25 additions & 4 deletions src/gc-mmtk.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "gc-common.h"
#include "gc-tls-mmtk.h"
#include "gc-wb-mmtk.h"
#include "mmtkMutator.h"
#include "threading.h"

Expand Down Expand Up @@ -861,10 +862,23 @@ STATIC_INLINE void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t
return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1);
}

// Atomically set the side-metadata bit covering `obj` in the table rooted at
// `side_metadata_base`. One metadata bit covers an 8-byte granule
// (addr >> 3 selects the bit) and one metadata byte covers 64 bytes of heap
// (addr >> 6 selects the byte). The caller passes
// MMTK_SIDE_LOG_BIT_BASE_ADDRESS, i.e. this sets the per-object log bit.
inline void mmtk_set_side_metadata(const void* side_metadata_base, void* obj) {
    intptr_t addr = (intptr_t)obj;
    uint8_t *meta_addr = (uint8_t*)side_metadata_base + (addr >> 6);
    uint8_t mask = (uint8_t)(1 << ((addr >> 3) & 0x7));
    // CAS loop: re-read the byte on each attempt so concurrent updates to
    // neighboring bits in the same metadata byte are never lost.
    for (;;) {
        uint8_t seen = *meta_addr;
        uint8_t desired = (uint8_t)(seen | mask);
        if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &seen, desired))
            return;
    }
}

// Post-allocation hook for immortal-space fast-path allocations.
// For plans using the object barrier, newly allocated immortal objects must
// have their side-metadata log bit set so the write barrier treats them as
// already logged; other plans need no post-allocation work. `size` is
// currently unused.
STATIC_INLINE void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size)
{
    if (MMTK_NEEDS_WRITE_BARRIER != MMTK_OBJECT_BARRIER)
        return;
    mmtk_set_side_metadata(MMTK_SIDE_LOG_BIT_BASE_ADDRESS, obj);
}

JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty)
Expand Down Expand Up @@ -1081,6 +1095,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
mmtk_set_vm_space((void*)img_data, len);
}

// Notify MMTk that the memory range [img_data, img_data + len) was allocated
// for the boot image (see the declaration in gc-interface.h), so the
// immortal-space post-allocation bookkeeping can be applied to that region.
void jl_gc_notify_image_alloc(const char* img_data, size_t len)
{
    mmtk_immortal_region_post_alloc((void*)img_data, len);
}

// ========================================================================= //
// Code specific to stock that is not supported by MMTk
// ========================================================================= //
Expand Down Expand Up @@ -1128,7 +1147,9 @@ _Atomic(int) gc_stack_free_idx = 0;

// Out-of-line write-barrier slow path: log `ptr` with MMTk so it is
// re-scanned by the next nursery collection. The second reference (the
// stored target) is passed as NULL because the object barrier only needs
// the source object.
// Fix: drop the stale `mmtk_unreachable()` call that preceded the real
// implementation — it would abort the process before the barrier logic ran.
JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT
{
    jl_task_t *ct = jl_current_task;
    jl_ptls_t ptls = ct->ptls;
    mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, ptr, (const void*)0);
}

JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored,
Expand Down
5 changes: 5 additions & 0 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -4074,6 +4074,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
// Do nothing
}

// Stock-GC implementation of the boot-image allocation notification declared
// in gc-interface.h. The stock GC keeps no per-range metadata for image
// memory, so this hook is intentionally a no-op; only the MMTk build
// (gc-mmtk.c) consumes it.
void jl_gc_notify_image_alloc(const char* img_data, size_t len)
{
    // Do nothing
}

// Report which GC implementation this binary was built with.
JL_DLLEXPORT const char* jl_gc_active_impl(void) {
    return "Built with stock GC";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,31 @@

#include "llvm-gc-interface-passes.h"

Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
#define DEBUG_TYPE "mmtk_final_gc_lowering"
STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");

Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
{
assert(target->arg_size() == 3);
++GCAllocBytesCount;
CallInst *newI;

IRBuilder<> builder(target);
auto ptls = target->getArgOperand(0);
auto type = target->getArgOperand(2);
uint64_t derefBytes = 0;
if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
size_t sz = (size_t)CI->getZExtValue();
// This is strongly architecture and OS dependent
int osize;
int offset = jl_gc_classify_pools(sz, &osize);
if (offset >= 0) {
if (offset < 0) {
newI = builder.CreateCall(
bigAllocFunc,
{ ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
if (sz > 0)
derefBytes = sz;
}
else {
// In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
// We do a slowpath/fastpath check and lower it only on the slowpath, returning
// the cursor and updating it in the fastpath.
Expand Down Expand Up @@ -91,6 +103,76 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
return phiNode;
}
}
} else {
auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
// allocTypedFunc does not include the type tag in the allocation size!
newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
derefBytes = sizeof(void*);
}
return target;
newI->setAttributes(newI->getCalledFunction()->getAttributes());
unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
if (derefBytes > 0)
newI->addDereferenceableRetAttr(derefBytes);
newI->takeName(target);
return newI;
}


// Lowers the julia.write_barrier intrinsic for MMTk builds. For plans using
// the object barrier, emit an inline check of the source object's
// side-metadata log bit and branch to the queueGCRoot slow path only when
// that bit is set; for plans without write barriers, lower to nothing.
void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
    State S(F);
    auto parent = target->getArgOperand(0);
    // The barrier can be elided entirely when every stored child is either
    // the parent itself or permanently rooted.
    if (std::all_of(target->op_begin() + 1, target->op_end(),
            [parent, &S](Value *child) { return parent == child || IsPermRooted(child, &S); })) {
        return;
    }

    IRBuilder<> builder(target);
    builder.SetCurrentDebugLocation(target->getDebugLoc());

    // FIXME: Currently we call write barrier with the src object (parent).
    // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all.
    // But for other MMTk plans, we need to be careful.
    const bool INLINE_WRITE_BARRIER = true; // toggle: inline fast path vs. always calling the slow path
    if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
        if (INLINE_WRITE_BARRIER) {
            auto i8_ty = Type::getInt8Ty(F.getContext());
            auto intptr_ty = T_size;

            // intptr_t addr = (intptr_t) (void*) src;
            // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
            intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS);
            auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address);
            auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0));

            auto parent_val = builder.CreatePtrToInt(parent, intptr_ty);
            auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6));
            auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr);

            // intptr_t shift = (addr >> 3) & 0b111;
            auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7));
            auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty);

            // uint8_t byte_val = *meta_addr;
            // NOTE(review): Align() defaults to 1-byte alignment; metadata is
            // byte-granular so this is the natural alignment for the load.
            auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align());

            // if (((byte_val >> shift) & 1) == 1) {
            auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8);
            auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1));
            auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1));

            // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
            // The slow path is expected to be rare; weight the branch 1:9.
            MDBuilder MDB(F.getContext());
            SmallVector<uint32_t, 2> Weights{1, 9};

            auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, target, false, MDB.createBranchWeights(Weights));
            builder.SetInsertPoint(mayTriggerSlowpath);
            builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), { parent });
        } else {
            // Out-of-line variant: unconditionally call the slow path.
            Function *wb_func = getOrDeclare(jl_intrinsics::queueGCRoot);
            builder.CreateCall(wb_func, { parent });
        }
    } else {
        // Using a plan that does not need write barriers
    }
}
86 changes: 86 additions & 0 deletions src/llvm-final-gc-lowering-stock.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "llvm-gc-interface-passes.h"

#define DEBUG_TYPE "stock_final_gc_lowering"
STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");

// Lowers the julia.gc_alloc_bytes intrinsic for the stock GC to a direct
// call to one of the runtime allocators: the pool (small) allocator or the
// big-object allocator when the requested size is a compile-time constant,
// and the generic typed allocator otherwise. Returns the replacement call,
// annotated with return alignment and dereferenceability.
Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
{
    ++GCAllocBytesCount;

    IRBuilder<> builder(target);
    auto ptls = target->getArgOperand(0);
    auto type = target->getArgOperand(2);
    CallInst *replacement;
    uint64_t derefBytes = 0;

    if (auto sizeConst = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
        // Constant size: choose the allocator statically.
        size_t sz = (size_t)sizeConst->getZExtValue();
        // This is strongly architecture and OS dependent
        int osize;
        int offset = jl_gc_classify_pools(sz, &osize);
        if (offset >= 0) {
            // Fits a size-class pool: call the small allocator with the
            // pool offset and object size computed above.
            auto i32_ty = Type::getInt32Ty(F.getContext());
            replacement = builder.CreateCall(
                smallAllocFunc,
                { ptls, ConstantInt::get(i32_ty, offset), ConstantInt::get(i32_ty, osize), type });
        }
        else {
            // Too large for any pool: the big-object allocator wants the
            // type-tag word included in the requested size.
            replacement = builder.CreateCall(
                bigAllocFunc,
                { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
        }
        if (sz > 0)
            derefBytes = sz;
    }
    else {
        // Dynamic size: fall back to the generic typed allocator.
        auto dynSize = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
        // allocTypedFunc does not include the type tag in the allocation size!
        replacement = builder.CreateCall(allocTypedFunc, { ptls, dynSize, type });
        derefBytes = sizeof(void*);
    }

    // Propagate callee attributes, then strengthen the return attributes:
    // word alignment at minimum, plus dereferenceable(derefBytes) when known.
    replacement->setAttributes(replacement->getCalledFunction()->getAttributes());
    unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(),
                              (unsigned)sizeof(void*));
    replacement->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
    if (derefBytes > 0)
        replacement->addDereferenceableRetAttr(derefBytes);
    replacement->takeName(target);
    return replacement;
}

// Lowers the julia.write_barrier intrinsic for the stock generational GC:
// emits the fast-path checks inline and calls the queueGCRoot slow path only
// when an old-marked parent stores a child that is not marked.
void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
    State S(F);
    auto parent = target->getArgOperand(0);
    // The barrier can be dropped entirely when every stored child is either
    // the parent itself or permanently rooted.
    if (std::all_of(target->op_begin() + 1, target->op_end(),
            [parent, &S](Value *child) { return parent == child || IsPermRooted(child, &S); })) {
        return;
    }

    IRBuilder<> builder(target);
    builder.SetCurrentDebugLocation(target->getDebugLoc());
    // Fast-path check 1: only parents whose tag has all GC_OLD_MARKED bits
    // set can require the barrier.
    auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent, tbaa_tag), GC_OLD_MARKED, "parent_bits");
    auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
    auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, target, false);
    builder.SetInsertPoint(mayTrigTerm);
    mayTrigTerm->getParent()->setName("may_trigger_wb");
    // Fast-path check 2: OR together "child lacks GC_MARKED" for every
    // stored child; any unmarked child triggers the slow path.
    Value *anyChldNotMarked = NULL;
    for (unsigned i = 1; i < target->arg_size(); i++) {
        Value *child = target->getArgOperand(i);
        Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child, tbaa_tag), GC_MARKED, "child_bit");
        Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked");
        anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
    }
    assert(anyChldNotMarked); // handled by all_of test above
    // The slow path is expected to be rare; weight the branch 1:9.
    MDBuilder MDB(parent->getContext());
    SmallVector<uint32_t, 2> Weights{1, 9};
    auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
                                              MDB.createBranchWeights(Weights));
    trigTerm->getParent()->setName("trigger_wb");
    builder.SetInsertPoint(trigTerm);
    // Only the plain write_barrier intrinsic is expected here; queue the
    // parent for re-scanning.
    if (target->getCalledOperand() == write_barrier_func) {
        builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent);
    }
    else {
        assert(false);
    }
}
Loading