Refactoring codegen passes that lower write barriers and allocation #57769

Draft: wants to merge 1 commit into master
4 changes: 2 additions & 2 deletions src/Makefile
@@ -72,9 +72,9 @@ ifeq ($(JULIACODEGEN),LLVM)
 GC_CODEGEN_SRCS := llvm-final-gc-lowering llvm-late-gc-lowering llvm-gc-invariant-verifier
 ifeq (${USE_THIRD_PARTY_GC},mmtk)
 FLAGS += -I$(MMTK_API_INC)
-GC_CODEGEN_SRCS += llvm-late-gc-lowering-mmtk
+GC_CODEGEN_SRCS += llvm-final-gc-lowering-mmtk
 else
-GC_CODEGEN_SRCS += llvm-late-gc-lowering-stock
+GC_CODEGEN_SRCS += llvm-final-gc-lowering-stock
 endif
 CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \
 llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces null_sysimage \
src/llvm-final-gc-lowering-mmtk.cpp
@@ -2,19 +2,31 @@

 #include "llvm-gc-interface-passes.h"
 
-Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+#define DEBUG_TYPE "mmtk_final_gc_lowering"
+STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");
+
+Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
 {
     assert(target->arg_size() == 3);
+    ++GCAllocBytesCount;
+    CallInst *newI;
 
     IRBuilder<> builder(target);
     auto ptls = target->getArgOperand(0);
     auto type = target->getArgOperand(2);
+    uint64_t derefBytes = 0;
     if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
         size_t sz = (size_t)CI->getZExtValue();
         // This is strongly architecture and OS dependent
         int osize;
         int offset = jl_gc_classify_pools(sz, &osize);
-        if (offset >= 0) {
+        if (offset < 0) {
+            newI = builder.CreateCall(
+                bigAllocFunc,
+                { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
+            if (sz > 0)
+                derefBytes = sz;
+        }
+        else {
             // In this case, instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc,
             // we emit a fastpath/slowpath check and only call into the runtime on the
             // slowpath; the fastpath returns the current cursor and bumps it in place.
@@ -91,6 +103,70 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
                 return phiNode;
             }
         }
+    } else {
+        auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
+        // allocTypedFunc does not include the type tag in the allocation size!
+        newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
+        derefBytes = sizeof(void*);
+    }
-    return target;
+    newI->setAttributes(newI->getCalledFunction()->getAttributes());
+    unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
+    newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
+    if (derefBytes > 0)
+        newI->addDereferenceableRetAttr(derefBytes);
+    newI->takeName(target);
+    return newI;
 }
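The elided fastpath emission above builds IR roughly equivalent to the following bump-pointer sketch; the allocator struct, its cursor/limit fields, and the slowpath entry point are illustrative assumptions, not the actual MMTk binding API. Only the fastpath/slowpath split described in the comment above is taken from the patch:

#include <cstddef>

// Illustrative thread-local bump-allocator state; the real fields live in
// the MMTk mutator (names here are assumptions for the sketch).
struct BumpAllocator { char *cursor; char *limit; };

// Out-of-line runtime call emitted on the slowpath (assumed name).
void *gc_alloc_slowpath(BumpAllocator *alloc, size_t osize);

void *gc_alloc_bytes(BumpAllocator *alloc, size_t osize) {
    char *cursor = alloc->cursor;
    if (cursor + osize <= alloc->limit) {   // fastpath: bump and return cursor
        alloc->cursor = cursor + osize;
        return cursor;                      // one incoming value of the phi above
    }
    return gc_alloc_slowpath(alloc, osize); // slowpath: the only lowered call
}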


+void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
+    auto parent = target->getArgOperand(0);
+    IRBuilder<> builder(target);
+    builder.SetCurrentDebugLocation(target->getDebugLoc());
+
+    // FIXME: Currently we call the write barrier with the src object (parent).
+    // This works fine for the object barrier of generational plans (such as
+    // StickyImmix), which do not use the target object at all. But for other
+    // MMTk plans, we need to be careful.
+    const bool INLINE_WRITE_BARRIER = true;
+    if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
+        if (INLINE_WRITE_BARRIER) {
+            auto i8_ty = Type::getInt8Ty(F.getContext());
+            auto intptr_ty = T_size;
+
+            // intptr_t addr = (intptr_t) (void*) src;
+            // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
+            intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS);
+            auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address);
+            auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0));
+
+            auto parent_val = builder.CreatePtrToInt(parent, intptr_ty);
+            auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6));
+            auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr);
+
+            // intptr_t shift = (addr >> 3) & 0b111;
+            auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7));
+            auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty);
+
+            // uint8_t byte_val = *meta_addr;
+            auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align());
+
+            // if (((byte_val >> shift) & 1) == 1) {
+            auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8);
+            auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1));
+            auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1));
+
+            // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
+            MDBuilder MDB(F.getContext());
+            SmallVector<uint32_t, 2> Weights{1, 9};
+
+            auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, target, false, MDB.createBranchWeights(Weights));
+            builder.SetInsertPoint(mayTriggerSlowpath);
+            builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), { parent });
+        } else {
+            Function *wb_func = getOrDeclare(jl_intrinsics::queueGCRoot);
+            builder.CreateCall(wb_func, { parent });
+        }
+    } else {
+        // The plan in use does not need write barriers; nothing to lower.
+    }
+}
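The inlined barrier implements MMTk's side log-bit lookup: one bit per 8-byte granule, so each metadata byte covers 64 bytes of heap (hence addr >> 6), and (addr >> 3) & 7 selects the bit within that byte; the 1:9 branch weights mark the slowpath branch as unlikely. A C++ sketch of the check, with SIDE_METADATA_BASE_ADDRESS standing in for MMTK_SIDE_LOG_BIT_BASE_ADDRESS:

#include <cstdint>

// Assumed to be provided by the MMTk binding (MMTK_SIDE_LOG_BIT_BASE_ADDRESS).
extern const uintptr_t SIDE_METADATA_BASE_ADDRESS;

// True when the object's log bit is set, i.e. the object is unlogged and the
// write-barrier slowpath (queueGCRoot) must run for it.
inline bool object_is_unlogged(const void *parent) {
    uintptr_t addr = reinterpret_cast<uintptr_t>(parent);
    const uint8_t *meta_addr =
        reinterpret_cast<const uint8_t *>(SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
    unsigned shift = (addr >> 3) & 7;  // bit index within the metadata byte
    return ((*meta_addr >> shift) & 1) == 1;
}

For example, an object at address 0x1000 maps to metadata byte SIDE_METADATA_BASE_ADDRESS + 0x40, bit 0.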
80 changes: 80 additions & 0 deletions src/llvm-final-gc-lowering-stock.cpp
@@ -0,0 +1,80 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "llvm-gc-interface-passes.h"
+
+#define DEBUG_TYPE "stock_final_gc_lowering"
+STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");
+
+Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
+{
+    ++GCAllocBytesCount;
+    CallInst *newI;
+
+    IRBuilder<> builder(target);
+    auto ptls = target->getArgOperand(0);
+    auto type = target->getArgOperand(2);
+    uint64_t derefBytes = 0;
+    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
+        size_t sz = (size_t)CI->getZExtValue();
+        // This is strongly architecture and OS dependent
+        int osize;
+        int offset = jl_gc_classify_pools(sz, &osize);
+        if (offset < 0) {
+            newI = builder.CreateCall(
+                bigAllocFunc,
+                { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
+            if (sz > 0)
+                derefBytes = sz;
+        }
+        else {
+            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset);
+            auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
+            newI = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize, type });
+            if (sz > 0)
+                derefBytes = sz;
+        }
+    } else {
+        auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
+        // allocTypedFunc does not include the type tag in the allocation size!
+        newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
+        derefBytes = sizeof(void*);
+    }
+    newI->setAttributes(newI->getCalledFunction()->getAttributes());
+    unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
+    newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
+    if (derefBytes > 0)
+        newI->addDereferenceableRetAttr(derefBytes);
+    newI->takeName(target);
+    return newI;
+}
+
+void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
+    auto parent = target->getArgOperand(0);
+    IRBuilder<> builder(target);
+    builder.SetCurrentDebugLocation(target->getDebugLoc());
+    auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent, tbaa_tag), GC_OLD_MARKED, "parent_bits");
+    auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
+    auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, target, false);
+    builder.SetInsertPoint(mayTrigTerm);
+    mayTrigTerm->getParent()->setName("may_trigger_wb");
+    Value *anyChldNotMarked = NULL;
+    for (unsigned i = 1; i < target->arg_size(); i++) {
+        Value *child = target->getArgOperand(i);
+        Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child, tbaa_tag), GC_MARKED, "child_bit");
+        Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked");
+        anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
+    }
+    assert(anyChldNotMarked); // handled by all_of test above
+    MDBuilder MDB(parent->getContext());
+    SmallVector<uint32_t, 2> Weights{1, 9};
+    auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
+                                              MDB.createBranchWeights(Weights));
+    trigTerm->getParent()->setName("trigger_wb");
+    builder.SetInsertPoint(trigTerm);
+    if (target->getCalledOperand() == write_barrier_func) {
+        builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent);
+    }
+    else {
+        assert(false);
+    }
+}
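The control flow emitted above is equivalent to this C++ sketch of the stock generational barrier; tag() stands in for the header load done by EmitLoadTag, the GC bit values are as defined in Julia's GC headers, and jl_gc_queue_root is the runtime slowpath behind the queueGCRoot intrinsic. For multi-argument barriers the per-child checks are OR-ed together, as in the loop above:

#include <cstdint>

// GC tag bits as in Julia's GC headers: GC_MARKED = 1, GC_OLD = 2.
constexpr uintptr_t GC_MARKED = 1;
constexpr uintptr_t GC_OLD_MARKED = 3;  // GC_OLD | GC_MARKED

uintptr_t tag(const void *obj);            // shorthand for the header load
void jl_gc_queue_root(const void *parent); // runtime slowpath

void write_barrier(const void *parent, const void *child) {
    // may_trigger_wb: only an old, marked parent can need the barrier
    if ((tag(parent) & GC_OLD_MARKED) == GC_OLD_MARKED) {
        // trigger_wb: child not marked, i.e. possibly young (unlikely, 1:9)
        if ((tag(child) & GC_MARKED) == 0)
            jl_gc_queue_root(parent);
    }
}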
93 changes: 46 additions & 47 deletions src/llvm-final-gc-lowering.cpp
@@ -7,7 +7,6 @@ STATISTIC(NewGCFrameCount, "Number of lowered newGCFrameFunc intrinsics");
 STATISTIC(PushGCFrameCount, "Number of lowered pushGCFrameFunc intrinsics");
 STATISTIC(PopGCFrameCount, "Number of lowered popGCFrameFunc intrinsics");
 STATISTIC(GetGCFrameSlotCount, "Number of lowered getGCFrameSlotFunc intrinsics");
-STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");
 STATISTIC(QueueGCRootCount, "Number of lowered queueGCRootFunc intrinsics");
 STATISTIC(SafepointCount, "Number of lowered safepoint intrinsics");

@@ -117,51 +116,6 @@ void FinalLowerGC::lowerSafepoint(CallInst *target, Function &F)
     target->eraseFromParent();
 }
 
-void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
-{
-    ++GCAllocBytesCount;
-    assert(target->arg_size() == 3);
-    CallInst *newI;
-
-    IRBuilder<> builder(target);
-    auto ptls = target->getArgOperand(0);
-    auto type = target->getArgOperand(2);
-    uint64_t derefBytes = 0;
-    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
-        size_t sz = (size_t)CI->getZExtValue();
-        // This is strongly architecture and OS dependent
-        int osize;
-        int offset = jl_gc_classify_pools(sz, &osize);
-        if (offset < 0) {
-            newI = builder.CreateCall(
-                bigAllocFunc,
-                { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
-            if (sz > 0)
-                derefBytes = sz;
-        }
-        else {
-            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset);
-            auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
-            newI = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize, type });
-            if (sz > 0)
-                derefBytes = sz;
-        }
-    } else {
-        auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
-        // allocTypedFunc does not include the type tag in the allocation size!
-        newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
-        derefBytes = sizeof(void*);
-    }
-    newI->setAttributes(newI->getCalledFunction()->getAttributes());
-    unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
-    newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
-    if (derefBytes > 0)
-        newI->addDereferenceableRetAttr(derefBytes);
-    newI->takeName(target);
-    target->replaceAllUsesWith(newI);
-    target->eraseFromParent();
-}

static bool hasUse(const JuliaPassContext &ctx, const jl_intrinsics::IntrinsicDescription &v)
{
auto Intr = ctx.getOrNull(v);
@@ -178,13 +132,17 @@ bool FinalLowerGC::shouldRunFinalGC()
     should_run |= hasUse(*this, jl_intrinsics::GCAllocBytes);
     should_run |= hasUse(*this, jl_intrinsics::queueGCRoot);
     should_run |= hasUse(*this, jl_intrinsics::safepoint);
+    should_run |= (write_barrier_func && !write_barrier_func->use_empty());
     return should_run;
 }

 bool FinalLowerGC::runOnFunction(Function &F)
 {
     initAll(*F.getParent());
     pgcstack = getPGCstack(F);
 
+    auto gc_alloc_bytes = getOrNull(jl_intrinsics::GCAllocBytes);
+
     if (!pgcstack || !shouldRunFinalGC())
         goto verify_skip;

@@ -195,6 +153,41 @@
     allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped);
     T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
 
+    // Lowering these may require creating new BasicBlocks, which would
+    // invalidate the per-instruction iteration below. Process them first.
+    if (gc_alloc_bytes) {
+        for (auto it = gc_alloc_bytes->user_begin(); it != gc_alloc_bytes->user_end(); ) {
+            if (auto *CI = dyn_cast<CallInst>(*it)) {
+                assert(CI->getCalledOperand() == gc_alloc_bytes);
+                auto newI = lowerGCAllocBytes(CI, F);
+                if (newI != CI) {
+                    ++it;
+                    CI->replaceAllUsesWith(newI);
+                    CI->eraseFromParent();
+                    continue;
+                }
+            }
+            ++it;
+        }
+    }
+
+    // Write barriers must also be lowered before the intrinsic-lowering loop
+    // below, since lowering them may insert new julia.queue_gc_root intrinsics.
+    if (write_barrier_func) {
+        for (auto it = write_barrier_func->user_begin(); it != write_barrier_func->user_end(); ) {
+            if (auto *CI = dyn_cast<CallInst>(*it)) {
+                assert(CI->getCalledOperand() == write_barrier_func);
+                lowerWriteBarrier(CI, F);
+                ++it;
+                CI->eraseFromParent();
+                continue;
+            }
+            ++it;
+        }
+    }

     // Lower all calls to supported intrinsics.
     for (auto &BB : F) {
         for (auto &I : make_early_inc_range(BB)) {
@@ -217,13 +210,13 @@
             LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot);
             LOWER_INTRINSIC(pushGCFrame, lowerPushGCFrame);
             LOWER_INTRINSIC(popGCFrame, lowerPopGCFrame);
-            LOWER_INTRINSIC(GCAllocBytes, lowerGCAllocBytes);
             LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot);
             LOWER_INTRINSIC(safepoint, lowerSafepoint);
 
 #undef LOWER_INTRINSIC
         }
     }
 
     return true;
     // Verify that skipping was in fact correct
 verify_skip:
@@ -236,6 +229,12 @@

         Value *callee = CI->getCalledOperand();
         assert(callee);
+        if (write_barrier_func == callee) {
+            errs() << "Final-GC-lowering didn't eliminate all write barriers from '" << F.getName() << "', dumping entire module!\n\n";
+            errs() << *F.getParent() << "\n";
+            abort();
+        }
+
         auto IS_INTRINSIC = [&](auto intrinsic) {
             auto intrinsic2 = getOrNull(intrinsic);
             if (intrinsic2 == callee) {