diff --git a/src/Makefile b/src/Makefile index c605d6c70573b..d46f5fb767ca3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -72,9 +72,9 @@ ifeq ($(JULIACODEGEN),LLVM) GC_CODEGEN_SRCS := llvm-final-gc-lowering llvm-late-gc-lowering llvm-gc-invariant-verifier ifeq (${USE_THIRD_PARTY_GC},mmtk) FLAGS += -I$(MMTK_API_INC) -GC_CODEGEN_SRCS += llvm-late-gc-lowering-mmtk +GC_CODEGEN_SRCS += llvm-final-gc-lowering-mmtk else -GC_CODEGEN_SRCS += llvm-late-gc-lowering-stock +GC_CODEGEN_SRCS += llvm-final-gc-lowering-stock endif CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \ llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces null_sysimage \ diff --git a/src/gc-interface.h b/src/gc-interface.h index 9108cdb331946..4011ad89c410e 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -214,6 +214,10 @@ struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty, unsigned align) JL_NOTSAF // The GC may use that information to, for instance, determine that such objects should // be treated as marked and belonged to the old generation in nursery collections. void jl_gc_notify_image_load(const char* img_data, size_t len); +// This function notifies the GC about memory addresses that are set when allocating the boot image. +// The GC may use that information to, for instance, determine that all objects in that chunk of memory should +// be treated as marked and belonged to the old generation in nursery collections. +void jl_gc_notify_image_alloc(const char* img_data, size_t len); // ========================================================================= // // Runtime Write-Barriers @@ -252,13 +256,11 @@ STATIC_INLINE void jl_gc_wb_knownold(const void *parent JL_UNUSED, const void *p // per field of the object being copied, but may be special-cased for performance reasons. STATIC_INLINE void jl_gc_multi_wb(const void *parent, const struct _jl_value_t *ptr) JL_NOTSAFEPOINT; - // Write-barrier function that must be used after copying fields of elements of genericmemory objects // into another. It should be semantically equivalent to triggering multiple write barriers – one // per field of the object being copied, but may be special-cased for performance reasons. STATIC_INLINE void jl_gc_wb_genericmemory_copy_ptr(const struct _jl_value_t *owner, struct _jl_genericmemory_t *src, char* src_p, size_t n, struct _jl_datatype_t *dt) JL_NOTSAFEPOINT; - // Similar to jl_gc_wb_genericmemory_copy but must be used when copying *boxed* elements of a genericmemory // object. Note that this barrier also performs the copying unlike jl_gc_wb_genericmemory_copy_ptr. 
// The parameters src_p, dest_p and n will be modified and will contain information about diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c index a6650dd7cb68c..5f8524b3a58b9 100644 --- a/src/gc-mmtk.c +++ b/src/gc-mmtk.c @@ -1,5 +1,6 @@ #include "gc-common.h" #include "gc-tls-mmtk.h" +#include "gc-wb-mmtk.h" #include "mmtkMutator.h" #include "threading.h" @@ -861,10 +862,23 @@ STATIC_INLINE void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1); } +inline void mmtk_set_side_metadata(const void* side_metadata_base, void* obj) { + intptr_t addr = (intptr_t) obj; + uint8_t* meta_addr = (uint8_t*) side_metadata_base + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + while(1) { + uint8_t old_val = *meta_addr; + uint8_t new_val = old_val | (1 << shift); + if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) { + break; + } + } +} + STATIC_INLINE void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { - // FIXME: Similarly, for now, we do nothing - // but when supporting moving, this is where we set the valid object (VO) bit - // and log (old gen) bit + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + mmtk_set_side_metadata(MMTK_SIDE_LOG_BIT_BASE_ADDRESS, obj); + } } JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty) @@ -1081,6 +1095,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len) mmtk_set_vm_space((void*)img_data, len); } +void jl_gc_notify_image_alloc(const char* img_data, size_t len) +{ + mmtk_immortal_region_post_alloc((void*)img_data, len); +} + // ========================================================================= // // Code specific to stock that is not supported by MMTk // ========================================================================= // @@ -1128,7 +1147,9 @@ _Atomic(int) gc_stack_free_idx = 0; JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT { - mmtk_unreachable(); + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, ptr, (const void*) 0); } JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored, diff --git a/src/gc-stock.c b/src/gc-stock.c index 01453a30b2a4b..d240a68edb01a 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -4074,6 +4074,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len) // Do nothing } +void jl_gc_notify_image_alloc(const char* img_data, size_t len) +{ + // Do nothing +} + JL_DLLEXPORT const char* jl_gc_active_impl(void) { return "Built with stock GC"; } diff --git a/src/llvm-late-gc-lowering-mmtk.cpp b/src/llvm-final-gc-lowering-mmtk.cpp similarity index 56% rename from src/llvm-late-gc-lowering-mmtk.cpp rename to src/llvm-final-gc-lowering-mmtk.cpp index 5539c8dbcf153..9fc2439ea43c4 100644 --- a/src/llvm-late-gc-lowering-mmtk.cpp +++ b/src/llvm-final-gc-lowering-mmtk.cpp @@ -2,19 +2,31 @@ #include "llvm-gc-interface-passes.h" -Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) +#define DEBUG_TYPE "mmtk_final_gc_lowering" +STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics"); + +Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) { - assert(target->arg_size() == 3); + ++GCAllocBytesCount; + CallInst *newI; IRBuilder<> builder(target); auto ptls = 
target->getArgOperand(0); auto type = target->getArgOperand(2); + uint64_t derefBytes = 0; if (auto CI = dyn_cast(target->getArgOperand(1))) { size_t sz = (size_t)CI->getZExtValue(); // This is strongly architecture and OS dependent int osize; int offset = jl_gc_classify_pools(sz, &osize); - if (offset >= 0) { + if (offset < 0) { + newI = builder.CreateCall( + bigAllocFunc, + { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type }); + if (sz > 0) + derefBytes = sz; + } + else { // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc // We do a slowpath/fastpath check and lower it only on the slowpath, returning // the cursor and updating it in the fastpath. @@ -91,6 +103,76 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) return phiNode; } } + } else { + auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size); + // allocTypedFunc does not include the type tag in the allocation size! + newI = builder.CreateCall(allocTypedFunc, { ptls, size, type }); + derefBytes = sizeof(void*); } - return target; + newI->setAttributes(newI->getCalledFunction()->getAttributes()); + unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*)); + newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align))); + if (derefBytes > 0) + newI->addDereferenceableRetAttr(derefBytes); + newI->takeName(target); + return newI; +} + + +void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) { + State S(F); + auto parent = target->getArgOperand(0); + if (std::all_of(target->op_begin() + 1, target->op_end(), + [parent, &S](Value *child) { return parent == child || IsPermRooted(child, &S); })) { + return; + } + + IRBuilder<> builder(target); + builder.SetCurrentDebugLocation(target->getDebugLoc()); + + // FIXME: Currently we call write barrier with the src object (parent). + // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all. + // But for other MMTk plans, we need to be careful. 
+ const bool INLINE_WRITE_BARRIER = true; + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + if (INLINE_WRITE_BARRIER) { + auto i8_ty = Type::getInt8Ty(F.getContext()); + auto intptr_ty = T_size; + + // intptr_t addr = (intptr_t) (void*) src; + // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6)); + intptr_t metadata_base_address = reinterpret_cast(MMTK_SIDE_LOG_BIT_BASE_ADDRESS); + auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address); + auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0)); + + auto parent_val = builder.CreatePtrToInt(parent, intptr_ty); + auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6)); + auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr); + + // intptr_t shift = (addr >> 3) & 0b111; + auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7)); + auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty); + + // uint8_t byte_val = *meta_addr; + auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align()); + + // if (((byte_val >> shift) & 1) == 1) { + auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8); + auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1)); + auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1)); + + // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target); + MDBuilder MDB(F.getContext()); + SmallVector Weights{1, 9}; + + auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, target, false, MDB.createBranchWeights(Weights)); + builder.SetInsertPoint(mayTriggerSlowpath); + builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), { parent }); + } else { + Function *wb_func = getOrDeclare(jl_intrinsics::queueGCRoot); + builder.CreateCall(wb_func, { parent }); + } + } else { + // Using a plan that does not need write barriers + } } diff --git a/src/llvm-final-gc-lowering-stock.cpp b/src/llvm-final-gc-lowering-stock.cpp new file mode 100644 index 0000000000000..6a792309753cb --- /dev/null +++ b/src/llvm-final-gc-lowering-stock.cpp @@ -0,0 +1,86 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#include "llvm-gc-interface-passes.h" + +#define DEBUG_TYPE "stock_final_gc_lowering" +STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics"); + +Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) +{ + ++GCAllocBytesCount; + CallInst *newI; + + IRBuilder<> builder(target); + auto ptls = target->getArgOperand(0); + auto type = target->getArgOperand(2); + uint64_t derefBytes = 0; + if (auto CI = dyn_cast(target->getArgOperand(1))) { + size_t sz = (size_t)CI->getZExtValue(); + // This is strongly architecture and OS dependent + int osize; + int offset = jl_gc_classify_pools(sz, &osize); + if (offset < 0) { + newI = builder.CreateCall( + bigAllocFunc, + { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type }); + if (sz > 0) + derefBytes = sz; + } + else { + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); + auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); + newI = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize, type }); + if (sz > 0) + derefBytes = sz; + } + } else { + auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size); + // allocTypedFunc does not include the type tag in the allocation size! + newI = builder.CreateCall(allocTypedFunc, { ptls, size, type }); + derefBytes = sizeof(void*); + } + newI->setAttributes(newI->getCalledFunction()->getAttributes()); + unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*)); + newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align))); + if (derefBytes > 0) + newI->addDereferenceableRetAttr(derefBytes); + newI->takeName(target); + return newI; +} + +void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) { + State S(F); + auto parent = target->getArgOperand(0); + if (std::all_of(target->op_begin() + 1, target->op_end(), + [parent, &S](Value *child) { return parent == child || IsPermRooted(child, &S); })) { + return; + } + + IRBuilder<> builder(target); + builder.SetCurrentDebugLocation(target->getDebugLoc()); + auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent, tbaa_tag), GC_OLD_MARKED, "parent_bits"); + auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked"); + auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, target, false); + builder.SetInsertPoint(mayTrigTerm); + mayTrigTerm->getParent()->setName("may_trigger_wb"); + Value *anyChldNotMarked = NULL; + for (unsigned i = 1; i < target->arg_size(); i++) { + Value *child = target->getArgOperand(i); + Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child, tbaa_tag), GC_MARKED, "child_bit"); + Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked"); + anyChldNotMarked = anyChldNotMarked ? 
builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked; + } + assert(anyChldNotMarked); // handled by all_of test above + MDBuilder MDB(parent->getContext()); + SmallVector Weights{1, 9}; + auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false, + MDB.createBranchWeights(Weights)); + trigTerm->getParent()->setName("trigger_wb"); + builder.SetInsertPoint(trigTerm); + if (target->getCalledOperand() == write_barrier_func) { + builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent); + } + else { + assert(false); + } +} diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 7d3a233c0a720..b7e4e86a9c5bb 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -7,7 +7,6 @@ STATISTIC(NewGCFrameCount, "Number of lowered newGCFrameFunc intrinsics"); STATISTIC(PushGCFrameCount, "Number of lowered pushGCFrameFunc intrinsics"); STATISTIC(PopGCFrameCount, "Number of lowered popGCFrameFunc intrinsics"); STATISTIC(GetGCFrameSlotCount, "Number of lowered getGCFrameSlotFunc intrinsics"); -STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics"); STATISTIC(QueueGCRootCount, "Number of lowered queueGCRootFunc intrinsics"); STATISTIC(SafepointCount, "Number of lowered safepoint intrinsics"); @@ -117,51 +116,6 @@ void FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) target->eraseFromParent(); } -void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) -{ - ++GCAllocBytesCount; - assert(target->arg_size() == 3); - CallInst *newI; - - IRBuilder<> builder(target); - auto ptls = target->getArgOperand(0); - auto type = target->getArgOperand(2); - uint64_t derefBytes = 0; - if (auto CI = dyn_cast(target->getArgOperand(1))) { - size_t sz = (size_t)CI->getZExtValue(); - // This is strongly architecture and OS dependent - int osize; - int offset = jl_gc_classify_pools(sz, &osize); - if (offset < 0) { - newI = builder.CreateCall( - bigAllocFunc, - { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type }); - if (sz > 0) - derefBytes = sz; - } - else { - auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); - auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); - newI = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize, type }); - if (sz > 0) - derefBytes = sz; - } - } else { - auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size); - // allocTypedFunc does not include the type tag in the allocation size! 
- newI = builder.CreateCall(allocTypedFunc, { ptls, size, type }); - derefBytes = sizeof(void*); - } - newI->setAttributes(newI->getCalledFunction()->getAttributes()); - unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*)); - newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align))); - if (derefBytes > 0) - newI->addDereferenceableRetAttr(derefBytes); - newI->takeName(target); - target->replaceAllUsesWith(newI); - target->eraseFromParent(); -} - static bool hasUse(const JuliaPassContext &ctx, const jl_intrinsics::IntrinsicDescription &v) { auto Intr = ctx.getOrNull(v); @@ -178,6 +132,7 @@ bool FinalLowerGC::shouldRunFinalGC() should_run |= hasUse(*this, jl_intrinsics::GCAllocBytes); should_run |= hasUse(*this, jl_intrinsics::queueGCRoot); should_run |= hasUse(*this, jl_intrinsics::safepoint); + should_run |= (write_barrier_func && !write_barrier_func->use_empty()); return should_run; } @@ -185,6 +140,9 @@ bool FinalLowerGC::runOnFunction(Function &F) { initAll(*F.getParent()); pgcstack = getPGCstack(F); + + auto gc_alloc_bytes = getOrNull(jl_intrinsics::GCAllocBytes); + if (!pgcstack || !shouldRunFinalGC()) goto verify_skip; @@ -195,6 +153,41 @@ bool FinalLowerGC::runOnFunction(Function &F) allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped); T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); + // The replacement for these may require creating new BasicBlocks + // which messes up the loop below. Process them first + if (gc_alloc_bytes) { + for (auto it = gc_alloc_bytes->user_begin(); it != gc_alloc_bytes->user_end(); ) { + if (auto *CI = dyn_cast(*it)) { + + assert(CI->getCalledOperand() == gc_alloc_bytes); + + auto newI = lowerGCAllocBytes(CI, F); + if (newI != CI) { + ++it; + CI->replaceAllUsesWith(newI); + CI->eraseFromParent(); + continue; + } + } + ++it; + } + } + + // Write barriers should always be processed first since they may + // insert julia.queue_gc_root intrinsics + if (write_barrier_func) { + for (auto it = write_barrier_func->user_begin(); it != write_barrier_func->user_end(); ) { + if (auto *CI = dyn_cast(*it)) { + assert(CI->getCalledOperand() == write_barrier_func); + lowerWriteBarrier(CI, F); + ++it; + CI->eraseFromParent(); + continue; + } + ++it; + } + } + // Lower all calls to supported intrinsics. 
for (auto &BB : F) { for (auto &I : make_early_inc_range(BB)) { @@ -217,13 +210,13 @@ bool FinalLowerGC::runOnFunction(Function &F) LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot); LOWER_INTRINSIC(pushGCFrame, lowerPushGCFrame); LOWER_INTRINSIC(popGCFrame, lowerPopGCFrame); - LOWER_INTRINSIC(GCAllocBytes, lowerGCAllocBytes); LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot); LOWER_INTRINSIC(safepoint, lowerSafepoint); #undef LOWER_INTRINSIC } } + return true; // Verify that skipping was in fact correct verify_skip: @@ -236,6 +229,12 @@ bool FinalLowerGC::runOnFunction(Function &F) Value *callee = CI->getCalledOperand(); assert(callee); + if (write_barrier_func == callee) { + errs() << "Final-GC-lowering didn't eliminate all write barriers from '" << F.getName() << "', dumping entire module!\n\n"; + errs() << *F.getParent() << "\n"; + abort(); + } + auto IS_INTRINSIC = [&](auto intrinsic) { auto intrinsic2 = getOrNull(intrinsic); if (intrinsic2 == callee) { diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h index d1bb1fae01446..a3b93040e0f47 100644 --- a/src/llvm-gc-interface-passes.h +++ b/src/llvm-gc-interface-passes.h @@ -361,15 +361,11 @@ struct LateLowerGCFrame: private JuliaPassContext { void PlaceGCFrameStores(State &S, unsigned MinColorRoot, ArrayRef Colors, int PreAssignedColors, Value *GCFrame); void PlaceGCFrameReset(State &S, unsigned R, unsigned MinColorRoot, ArrayRef Colors, Value *GCFrame, Instruction *InsertBefore); void PlaceRootsAndUpdateCalls(ArrayRef Colors, int PreAssignedColors, State &S, std::map>); - void CleanupWriteBarriers(Function &F, State *S, const SmallVector &WriteBarriers, bool *CFGModified); bool CleanupIR(Function &F, State *S, bool *CFGModified); void NoteUseChain(State &S, BBState &BBS, User *TheUser); SmallVector GetPHIRefinements(PHINode *phi, State &S); void FixUpRefinements(ArrayRef PHINumbers, State &S); void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots); - Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V); - Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V); - Value* lowerGCAllocBytesLate(CallInst *target, Function &F); }; // The final GC lowering pass. This pass lowers platform-agnostic GC @@ -405,7 +401,7 @@ struct FinalLowerGC: private JuliaPassContext { void lowerGetGCFrameSlot(CallInst *target, Function &F); // Lowers a `julia.gc_alloc_bytes` intrinsic. - void lowerGCAllocBytes(CallInst *target, Function &F); + Value* lowerGCAllocBytes(CallInst *target, Function &F); // Lowers a `julia.queue_gc_root` intrinsic. void lowerQueueGCRoot(CallInst *target, Function &F); @@ -413,8 +409,64 @@ struct FinalLowerGC: private JuliaPassContext { // Lowers a `julia.safepoint` intrinsic. void lowerSafepoint(CallInst *target, Function &F); + // Lowers a `julia.write_barrier` function. 
+ void lowerWriteBarrier(CallInst *target, Function &F); + // Check if the pass should be run bool shouldRunFinalGC(); }; +// These are now used in LateLower and FinalLower + +// Size of T is assumed to be `sizeof(void*)` +inline Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V) +{ + assert(T == T_size || isa(T)); + return builder.CreateInBoundsGEP(T, V, ConstantInt::get(T_size, -1), V->getName() + ".tag_addr"); +} + +inline Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V, llvm::MDNode *tbaa_tag) +{ + auto addr = EmitTagPtr(builder, T_size, T_size, V); + auto &M = *builder.GetInsertBlock()->getModule(); + LoadInst *load = builder.CreateAlignedLoad(T_size, addr, M.getDataLayout().getPointerABIAlignment(0), V->getName() + ".tag"); + load->setOrdering(AtomicOrdering::Unordered); + load->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + MDBuilder MDB(load->getContext()); + auto *NullInt = ConstantInt::get(T_size, 0); + // We can be sure that the tag is at least 16 (1<<4) + // Hopefully this is enough to convince LLVM that the value is still not NULL + // after masking off the tag bits + auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(T_size, 16)); + load->setMetadata(LLVMContext::MD_range, MDB.createRange(NonNullInt, NullInt)); + return load; +} + +// FIXME: Should these be moved to llvm-final-gc-lowering.cpp? + +// Enable this optimization only on LLVM 4.0+ since this cause LLVM to optimize +// constant store loop to produce a `memset_pattern16` with a global variable +// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend. +// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled +// for NI pointers. +static SmallVector *FindRefinements(Value *V, State *S) +{ + if (!S) + return nullptr; + auto it = S->AllPtrNumbering.find(V); + if (it == S->AllPtrNumbering.end()) + return nullptr; + auto rit = S->Refinements.find(it->second); + return rit != S->Refinements.end() && !rit->second.empty() ? &rit->second : nullptr; +} + +inline bool IsPermRooted(Value *V, State *S) +{ + if (isa(V)) + return true; + if (auto *RefinePtr = FindRefinements(V, S)) + return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2; + return false; +} + #endif // LLVM_GC_PASSES_H diff --git a/src/llvm-late-gc-lowering-stock.cpp b/src/llvm-late-gc-lowering-stock.cpp deleted file mode 100644 index 2a11487773396..0000000000000 --- a/src/llvm-late-gc-lowering-stock.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// This file is a part of Julia. 
License is MIT: https://julialang.org/license - -#include "llvm-gc-interface-passes.h" - -Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) -{ - // Do nothing for the stock GC - return target; -} diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index a41e947c0b6b3..2614c62e0fdd2 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -1898,55 +1898,6 @@ std::pair, int> LateLowerGCFrame::ColorRoots(const State &S) return {Colors, PreAssignedColors}; } -// Size of T is assumed to be `sizeof(void*)` -Value *LateLowerGCFrame::EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V) -{ - assert(T == T_size || isa(T)); - return builder.CreateInBoundsGEP(T, V, ConstantInt::get(T_size, -1), V->getName() + ".tag_addr"); -} - -Value *LateLowerGCFrame::EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V) -{ - auto addr = EmitTagPtr(builder, T_size, T_size, V); - auto &M = *builder.GetInsertBlock()->getModule(); - LoadInst *load = builder.CreateAlignedLoad(T_size, addr, M.getDataLayout().getPointerABIAlignment(0), V->getName() + ".tag"); - load->setOrdering(AtomicOrdering::Unordered); - load->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); - MDBuilder MDB(load->getContext()); - auto *NullInt = ConstantInt::get(T_size, 0); - // We can be sure that the tag is at least 16 (1<<4) - // Hopefully this is enough to convince LLVM that the value is still not NULL - // after masking off the tag bits - auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(T_size, 16)); - load->setMetadata(LLVMContext::MD_range, MDB.createRange(NonNullInt, NullInt)); - return load; -} - -// Enable this optimization only on LLVM 4.0+ since this cause LLVM to optimize -// constant store loop to produce a `memset_pattern16` with a global variable -// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend. -// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled -// for NI pointers. -static SmallVector *FindRefinements(Value *V, State *S) -{ - if (!S) - return nullptr; - auto it = S->AllPtrNumbering.find(V); - if (it == S->AllPtrNumbering.end()) - return nullptr; - auto rit = S->Refinements.find(it->second); - return rit != S->Refinements.end() && !rit->second.empty() ? 
&rit->second : nullptr; -} - -static bool IsPermRooted(Value *V, State *S) -{ - if (isa(V)) - return true; - if (auto *RefinePtr = FindRefinements(V, S)) - return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2; - return false; -} - static inline void UpdatePtrNumbering(Value *From, Value *To, State *S) { if (!S) @@ -1965,50 +1916,6 @@ MDNode *createMutableTBAAAccessTag(MDNode *Tag) { return MDBuilder(Tag->getContext()).createMutableTBAAAccessTag(Tag); } -void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector &WriteBarriers, bool *CFGModified) { - auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); - for (auto CI : WriteBarriers) { - auto parent = CI->getArgOperand(0); - if (std::all_of(CI->op_begin() + 1, CI->op_end(), - [parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) { - CI->eraseFromParent(); - continue; - } - if (CFGModified) { - *CFGModified = true; - } - - IRBuilder<> builder(CI); - builder.SetCurrentDebugLocation(CI->getDebugLoc()); - auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits"); - auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked"); - auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); - builder.SetInsertPoint(mayTrigTerm); - mayTrigTerm->getParent()->setName("may_trigger_wb"); - Value *anyChldNotMarked = NULL; - for (unsigned i = 1; i < CI->arg_size(); i++) { - Value *child = CI->getArgOperand(i); - Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit"); - Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked"); - anyChldNotMarked = anyChldNotMarked ? 
builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked; - } - assert(anyChldNotMarked); // handled by all_of test above - MDBuilder MDB(parent->getContext()); - SmallVector Weights{1, 9}; - auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false, - MDB.createBranchWeights(Weights)); - trigTerm->getParent()->setName("trigger_wb"); - builder.SetInsertPoint(trigTerm); - if (CI->getCalledOperand() == write_barrier_func) { - builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent); - } - else { - assert(false); - } - CI->eraseFromParent(); - } -} - bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { auto T_int32 = Type::getInt32Ty(F.getContext()); auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); @@ -2031,7 +1938,6 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { #endif ); } - SmallVector write_barriers; for (BasicBlock &BB : F) { for (auto it = BB.begin(); it != BB.end();) { Instruction *I = &*it; @@ -2179,21 +2085,13 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { assert(CI->arg_size() == 1); IRBuilder<> builder(CI); builder.SetCurrentDebugLocation(CI->getDebugLoc()); - auto tag = EmitLoadTag(builder, T_size, CI->getArgOperand(0)); + auto tag = EmitLoadTag(builder, T_size, CI->getArgOperand(0), tbaa_tag); auto masked = builder.CreateAnd(tag, ConstantInt::get(T_size, ~(uintptr_t)15)); auto typ = builder.CreateAddrSpaceCast(builder.CreateIntToPtr(masked, JuliaType::get_pjlvalue_ty(masked->getContext())), T_prjlvalue); typ->takeName(CI); CI->replaceAllUsesWith(typ); UpdatePtrNumbering(CI, typ, S); - } else if (write_barrier_func && callee == write_barrier_func) { - // The replacement for this requires creating new BasicBlocks - // which messes up the loop. Queue all of them to be replaced later. 
- assert(CI->arg_size() >= 1); - write_barriers.push_back(CI); - ChangesMade = true; - ++it; - continue; } else if ((call_func && callee == call_func) || (call2_func && callee == call2_func) || (call3_func && callee == call3_func)) { @@ -2309,7 +2207,6 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { ChangesMade = true; } } - CleanupWriteBarriers(F, S, write_barriers, CFGModified); if (maxframeargs == 0 && Frame) { Frame->eraseFromParent(); } @@ -2576,30 +2473,6 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { PlaceRootsAndUpdateCalls(Colors.first, Colors.second, S, CallFrames); CleanupIR(F, &S, CFGModified); - - // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk - // For now, we do nothing for the Stock GC - auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes); - - if (GCAllocBytes) { - for (auto it = GCAllocBytes->user_begin(); it != GCAllocBytes->user_end(); ) { - if (auto *CI = dyn_cast(*it)) { - *CFGModified = true; - - assert(CI->getCalledOperand() == GCAllocBytes); - - auto newI = lowerGCAllocBytesLate(CI, F); - if (newI != CI) { - ++it; - CI->replaceAllUsesWith(newI); - CI->eraseFromParent(); - continue; - } - } - ++it; - } - } - return true; } diff --git a/src/staticdata.c b/src/staticdata.c index b007fc04eeb4b..7902c80c65c59 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -3506,6 +3506,7 @@ JL_DLLEXPORT jl_image_buf_t jl_preload_sysimg(const char *fname) ios_seek_end(&f); size_t len = ios_pos(&f); char *sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0); + jl_gc_notify_image_alloc(sysimg, len); ios_seek(&f, 0); if (ios_readall(&f, sysimg, len) != len) @@ -4290,9 +4291,10 @@ static jl_value_t *jl_restore_package_image_from_stream(ios_t *f, jl_image_t *im char *sysimg; int success = !needs_permalloc; ios_seek(f, datastartpos); - if (needs_permalloc) + if (needs_permalloc) { sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0); - else + jl_gc_notify_image_alloc(sysimg, len); + } else sysimg = &f->buf[f->bpos]; if (needs_permalloc) success = ios_readall(f, sysimg, len) == len;
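
For reference, a minimal C sketch of the object-barrier fast path that the inlined IR in FinalLowerGC::lowerWriteBarrier (MMTk build) encodes. It reuses the side-metadata addressing shown in mmtk_set_side_metadata and the slow-path entry point that jl_gc_queue_root now calls; the extern declarations and the standalone helper name are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <stddef.h>

// Assumed to be provided by the MMTk headers that gc-mmtk.c already includes.
extern const void *MMTK_SIDE_LOG_BIT_BASE_ADDRESS;
extern void mmtk_object_reference_write_slow(void *mutator, const void *src, const void *target);

// Hypothetical helper: what the inlined barrier does for one store into `parent`.
static inline void mmtk_object_barrier_sketch(void *mutator, const void *parent)
{
    intptr_t addr = (intptr_t)parent;
    // One log bit per 8-byte granule: byte index = addr >> 6, bit index = (addr >> 3) & 7.
    const uint8_t *meta_addr = (const uint8_t *)MMTK_SIDE_LOG_BIT_BASE_ADDRESS + (addr >> 6);
    intptr_t shift = (addr >> 3) & 0x7;
    if (((*meta_addr >> shift) & 1) == 1) {
        // Bit set means the object is still unlogged (e.g. freshly allocated in the
        // immortal space, see mmtk_immortal_post_alloc_fast); take the slow path,
        // which logs it and remembers it for the next collection.
        mmtk_object_reference_write_slow(mutator, parent, NULL);
    }
}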
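
The stock-GC version of lowerWriteBarrier emits the usual generational check: enqueue the parent only when it is old-and-marked and at least one stored child is not yet marked. A rough C equivalent for a single child, assuming the jl_astaggedvalue/bits.gc tag layout from julia.h (the SKETCH_* constants mirror the runtime's GC_MARKED/GC_OLD_MARKED values and are repeated here only to keep the sketch self-contained):

#include "julia.h"

#define SKETCH_GC_MARKED     1  // mirrors GC_MARKED
#define SKETCH_GC_OLD_MARKED 3  // mirrors GC_OLD_MARKED (old bit | marked bit)

static inline void stock_write_barrier_sketch(jl_value_t *parent, jl_value_t *child)
{
    // "may_trigger_wb": parent is old and already marked ...
    int parent_old_marked =
        (jl_astaggedvalue(parent)->bits.gc & SKETCH_GC_OLD_MARKED) == SKETCH_GC_OLD_MARKED;
    // "trigger_wb": ... and the child has not been marked yet.
    int child_not_marked =
        (jl_astaggedvalue(child)->bits.gc & SKETCH_GC_MARKED) == 0;
    if (parent_old_marked && child_not_marked)
        jl_gc_queue_root(parent); // slow path; the pass emits julia.queue_gc_root here
}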
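
Finally, the constant-size branch of lowerGCAllocBytes picks between the pool ("small") and big allocators via jl_gc_classify_pools. At runtime it reduces to roughly the following (stock path shown; the MMTk version additionally inlines a bump-pointer fast path and only calls out on the slow path). The prototypes below restate internal GC entry points for the sake of a self-contained sketch and may differ from the instrumented symbols the pass actually binds through bigAllocFunc/smallAllocFunc/allocTypedFunc.

#include "julia.h"

int jl_gc_classify_pools(size_t sz, int *osize);
jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type);
jl_value_t *jl_gc_small_alloc(jl_ptls_t ptls, int pool_offset, int osize, jl_value_t *type);

static jl_value_t *gc_alloc_bytes_sketch(jl_ptls_t ptls, size_t sz, jl_value_t *type)
{
    int osize;
    int offset = jl_gc_classify_pools(sz, &osize);
    if (offset < 0)
        // Too large for any size pool; the extra word accounts for the type tag.
        return jl_gc_big_alloc(ptls, sz + sizeof(void*), type);
    // Pool allocation: offset selects the per-thread pool, osize is the rounded object size.
    return jl_gc_small_alloc(ptls, offset, osize, type);
}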