
Commit e826c88

Refactoring passes that lower write barrier and allocation
1 parent 0d363a8 commit e826c88

7 files changed: +278, -195 lines

src/Makefile (+2, -2)
@@ -72,9 +72,9 @@ ifeq ($(JULIACODEGEN),LLVM)
 GC_CODEGEN_SRCS := llvm-final-gc-lowering llvm-late-gc-lowering llvm-gc-invariant-verifier
 ifeq (${USE_THIRD_PARTY_GC},mmtk)
 FLAGS += -I$(MMTK_API_INC)
-GC_CODEGEN_SRCS += llvm-late-gc-lowering-mmtk
+GC_CODEGEN_SRCS += llvm-final-gc-lowering-mmtk
 else
-GC_CODEGEN_SRCS += llvm-late-gc-lowering-stock
+GC_CODEGEN_SRCS += llvm-final-gc-lowering-stock
 endif
 CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \
 	llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces null_sysimage \

src/llvm-late-gc-lowering-mmtk.cpp → src/llvm-final-gc-lowering-mmtk.cpp (+86, -4)
@@ -2,19 +2,31 @@
 
 #include "llvm-gc-interface-passes.h"
 
-Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+#define DEBUG_TYPE "mmtk_final_gc_lowering"
+STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");
+
+Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
 {
-    assert(target->arg_size() == 3);
+    ++GCAllocBytesCount;
     CallInst *newI;
 
     IRBuilder<> builder(target);
     auto ptls = target->getArgOperand(0);
     auto type = target->getArgOperand(2);
+    uint64_t derefBytes = 0;
     if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
         size_t sz = (size_t)CI->getZExtValue();
         // This is strongly architecture and OS dependent
         int osize;
         int offset = jl_gc_classify_pools(sz, &osize);
-        if (offset >= 0) {
+        if (offset < 0) {
+            newI = builder.CreateCall(
+                bigAllocFunc,
+                { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
+            if (sz > 0)
+                derefBytes = sz;
+        }
+        else {
             // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
             // We do a slowpath/fastpath check and lower it only on the slowpath, returning
             // the cursor and updating it in the fastpath.
@@ -91,6 +103,76 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
                 return phiNode;
             }
         }
+    } else {
+        auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
+        // allocTypedFunc does not include the type tag in the allocation size!
+        newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
+        derefBytes = sizeof(void*);
     }
-    return target;
+    newI->setAttributes(newI->getCalledFunction()->getAttributes());
+    unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
+    newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
+    if (derefBytes > 0)
+        newI->addDereferenceableRetAttr(derefBytes);
+    newI->takeName(target);
+    return newI;
+}
+
+
+void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
+    State S(F);
+    auto parent = target->getArgOperand(0);
+    if (std::all_of(target->op_begin() + 1, target->op_end(),
+                [parent, &S](Value *child) { return parent == child || IsPermRooted(child, &S); })) {
+        return;
+    }
+
+    IRBuilder<> builder(target);
+    builder.SetCurrentDebugLocation(target->getDebugLoc());
+
+    // FIXME: Currently we call write barrier with the src object (parent).
+    // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all.
+    // But for other MMTk plans, we need to be careful.
+    const bool INLINE_WRITE_BARRIER = true;
+    if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
+        if (INLINE_WRITE_BARRIER) {
+            auto i8_ty = Type::getInt8Ty(F.getContext());
+            auto intptr_ty = T_size;
+
+            // intptr_t addr = (intptr_t) (void*) src;
+            // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
+            intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS);
+            auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address);
+            auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0));
+
+            auto parent_val = builder.CreatePtrToInt(parent, intptr_ty);
+            auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6));
+            auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr);
+
+            // intptr_t shift = (addr >> 3) & 0b111;
+            auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7));
+            auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty);
+
+            // uint8_t byte_val = *meta_addr;
+            auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align());
+
+            // if (((byte_val >> shift) & 1) == 1) {
+            auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8);
+            auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1));
+            auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1));
+
+            // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
+            MDBuilder MDB(F.getContext());
+            SmallVector<uint32_t, 2> Weights{1, 9};
+
+            auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, target, false, MDB.createBranchWeights(Weights));
+            builder.SetInsertPoint(mayTriggerSlowpath);
+            builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), { parent });
+        } else {
+            Function *wb_func = getOrDeclare(jl_intrinsics::queueGCRoot);
+            builder.CreateCall(wb_func, { parent });
+        }
+    } else {
+        // Using a plan that does not need write barriers
+    }
 }
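
For readers who prefer C to IRBuilder calls, the inlined MMTk object barrier above boils down to the check sketched below: one log bit per 8 heap bytes is kept in a side-metadata table, and the slow path runs only while the parent object is still unlogged. This is a minimal sketch following the comments embedded in the pass; the parameter names side_log_bits and queue_gc_root_slow are illustrative stand-ins for MMTK_SIDE_LOG_BIT_BASE_ADDRESS and the julia.queue_gc_root call, not symbols introduced by this commit.

#include <cstdint>

// Sketch of the fast-path test the pass inlines; not part of the commit.
inline void mmtk_object_barrier_sketch(void *src,
                                       const uint8_t *side_log_bits,            // stand-in for MMTK_SIDE_LOG_BIT_BASE_ADDRESS
                                       void (*queue_gc_root_slow)(void *))      // stand-in for the julia.queue_gc_root slow path
{
    uintptr_t addr = (uintptr_t)src;
    // Each metadata byte covers 64 heap bytes; each bit covers 8.
    uint8_t byte_val = side_log_bits[addr >> 6];
    uintptr_t shift = (addr >> 3) & 0b111;
    if (((byte_val >> shift) & 1) == 1) {
        // Object is still "unlogged": take the slow path. The branch weights
        // attached by the pass (1:9) treat this as the uncommon case.
        queue_gc_root_slow(src);
    }
}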

src/llvm-final-gc-lowering-stock.cpp (+86)
@@ -0,0 +1,86 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "llvm-gc-interface-passes.h"
+
+#define DEBUG_TYPE "stock_final_gc_lowering"
+STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");
+
+Value* FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
+{
+    ++GCAllocBytesCount;
+    CallInst *newI;
+
+    IRBuilder<> builder(target);
+    auto ptls = target->getArgOperand(0);
+    auto type = target->getArgOperand(2);
+    uint64_t derefBytes = 0;
+    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
+        size_t sz = (size_t)CI->getZExtValue();
+        // This is strongly architecture and OS dependent
+        int osize;
+        int offset = jl_gc_classify_pools(sz, &osize);
+        if (offset < 0) {
+            newI = builder.CreateCall(
+                bigAllocFunc,
+                { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
+            if (sz > 0)
+                derefBytes = sz;
+        }
+        else {
+            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset);
+            auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
+            newI = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize, type });
+            if (sz > 0)
+                derefBytes = sz;
+        }
+    } else {
+        auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
+        // allocTypedFunc does not include the type tag in the allocation size!
+        newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
+        derefBytes = sizeof(void*);
+    }
+    newI->setAttributes(newI->getCalledFunction()->getAttributes());
+    unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
+    newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
+    if (derefBytes > 0)
+        newI->addDereferenceableRetAttr(derefBytes);
+    newI->takeName(target);
+    return newI;
+}
+
+void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
+    State S(F);
+    auto parent = target->getArgOperand(0);
+    if (std::all_of(target->op_begin() + 1, target->op_end(),
+                [parent, &S](Value *child) { return parent == child || IsPermRooted(child, &S); })) {
+        return;
+    }
+
+    IRBuilder<> builder(target);
+    builder.SetCurrentDebugLocation(target->getDebugLoc());
+    auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent, tbaa_tag), GC_OLD_MARKED, "parent_bits");
+    auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
+    auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, target, false);
+    builder.SetInsertPoint(mayTrigTerm);
+    mayTrigTerm->getParent()->setName("may_trigger_wb");
+    Value *anyChldNotMarked = NULL;
+    for (unsigned i = 1; i < target->arg_size(); i++) {
+        Value *child = target->getArgOperand(i);
+        Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child, tbaa_tag), GC_MARKED, "child_bit");
+        Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked");
+        anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
+    }
+    assert(anyChldNotMarked); // handled by all_of test above
+    MDBuilder MDB(parent->getContext());
+    SmallVector<uint32_t, 2> Weights{1, 9};
+    auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
+                        MDB.createBranchWeights(Weights));
+    trigTerm->getParent()->setName("trigger_wb");
+    builder.SetInsertPoint(trigTerm);
+    if (target->getCalledOperand() == write_barrier_func) {
+        builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent);
+    }
+    else {
+        assert(false);
+    }
+}
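
The generational check emitted here mirrors the runtime's inline write barrier: the slow call is only needed when the parent is old-and-marked and at least one stored child is not yet marked. A rough sketch of that condition for a single parent/child pair follows; it assumes the conventional bit values of the stock GC (GC_MARKED = 1, GC_OLD_MARKED = 3) and relies on julia.h for jl_astaggedvalue and jl_gc_queue_root.

#include "julia.h"   // jl_value_t, jl_astaggedvalue, jl_gc_queue_root

// Sketch only: the numeric bit values are an assumption about the stock GC's header layout.
enum { SK_GC_MARKED = 1, SK_GC_OLD_MARKED = 3 };

static inline void write_barrier_sketch(jl_value_t *parent, jl_value_t *child)
{
    // "may_trigger_wb": only an old, marked parent can create an old-to-young reference.
    if ((jl_astaggedvalue(parent)->bits.gc & SK_GC_OLD_MARKED) == SK_GC_OLD_MARKED) {
        // "trigger_wb": the child is not marked yet, so re-queue the parent for scanning.
        if ((jl_astaggedvalue(child)->bits.gc & SK_GC_MARKED) == 0)
            jl_gc_queue_root(parent);
    }
}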

src/llvm-final-gc-lowering.cpp (+46, -47)
@@ -7,7 +7,6 @@ STATISTIC(NewGCFrameCount, "Number of lowered newGCFrameFunc intrinsics");
 STATISTIC(PushGCFrameCount, "Number of lowered pushGCFrameFunc intrinsics");
 STATISTIC(PopGCFrameCount, "Number of lowered popGCFrameFunc intrinsics");
 STATISTIC(GetGCFrameSlotCount, "Number of lowered getGCFrameSlotFunc intrinsics");
-STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics");
 STATISTIC(QueueGCRootCount, "Number of lowered queueGCRootFunc intrinsics");
 STATISTIC(SafepointCount, "Number of lowered safepoint intrinsics");
 
@@ -117,51 +116,6 @@ void FinalLowerGC::lowerSafepoint(CallInst *target, Function &F)
     target->eraseFromParent();
 }
 
-void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
-{
-    ++GCAllocBytesCount;
-    assert(target->arg_size() == 3);
-    CallInst *newI;
-
-    IRBuilder<> builder(target);
-    auto ptls = target->getArgOperand(0);
-    auto type = target->getArgOperand(2);
-    uint64_t derefBytes = 0;
-    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
-        size_t sz = (size_t)CI->getZExtValue();
-        // This is strongly architecture and OS dependent
-        int osize;
-        int offset = jl_gc_classify_pools(sz, &osize);
-        if (offset < 0) {
-            newI = builder.CreateCall(
-                bigAllocFunc,
-                { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type });
-            if (sz > 0)
-                derefBytes = sz;
-        }
-        else {
-            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset);
-            auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
-            newI = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize, type });
-            if (sz > 0)
-                derefBytes = sz;
-        }
-    } else {
-        auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
-        // allocTypedFunc does not include the type tag in the allocation size!
-        newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
-        derefBytes = sizeof(void*);
-    }
-    newI->setAttributes(newI->getCalledFunction()->getAttributes());
-    unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
-    newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
-    if (derefBytes > 0)
-        newI->addDereferenceableRetAttr(derefBytes);
-    newI->takeName(target);
-    target->replaceAllUsesWith(newI);
-    target->eraseFromParent();
-}
-
 static bool hasUse(const JuliaPassContext &ctx, const jl_intrinsics::IntrinsicDescription &v)
 {
     auto Intr = ctx.getOrNull(v);
@@ -178,13 +132,17 @@ bool FinalLowerGC::shouldRunFinalGC()
     should_run |= hasUse(*this, jl_intrinsics::GCAllocBytes);
     should_run |= hasUse(*this, jl_intrinsics::queueGCRoot);
     should_run |= hasUse(*this, jl_intrinsics::safepoint);
+    should_run |= (write_barrier_func && !write_barrier_func->use_empty());
     return should_run;
 }
 
 bool FinalLowerGC::runOnFunction(Function &F)
 {
     initAll(*F.getParent());
     pgcstack = getPGCstack(F);
+
+    auto gc_alloc_bytes = getOrNull(jl_intrinsics::GCAllocBytes);
+
     if (!pgcstack || !shouldRunFinalGC())
         goto verify_skip;
 
@@ -195,6 +153,41 @@ bool FinalLowerGC::runOnFunction(Function &F)
     allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped);
     T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
 
+    // The replacement for these may require creating new BasicBlocks
+    // which messes up the loop below. Process them first
+    if (gc_alloc_bytes) {
+        for (auto it = gc_alloc_bytes->user_begin(); it != gc_alloc_bytes->user_end(); ) {
+            if (auto *CI = dyn_cast<CallInst>(*it)) {
+
+                assert(CI->getCalledOperand() == gc_alloc_bytes);
+
+                auto newI = lowerGCAllocBytes(CI, F);
+                if (newI != CI) {
+                    ++it;
+                    CI->replaceAllUsesWith(newI);
+                    CI->eraseFromParent();
+                    continue;
+                }
+            }
+            ++it;
+        }
+    }
+
+    // Write barriers should always be processed first since they may
+    // insert julia.queue_gc_root intrinsics
+    if (write_barrier_func) {
+        for (auto it = write_barrier_func->user_begin(); it != write_barrier_func->user_end(); ) {
+            if (auto *CI = dyn_cast<CallInst>(*it)) {
+                assert(CI->getCalledOperand() == write_barrier_func);
+                lowerWriteBarrier(CI, F);
+                ++it;
+                CI->eraseFromParent();
+                continue;
+            }
+            ++it;
+        }
+    }
+
     // Lower all calls to supported intrinsics.
     for (auto &BB : F) {
         for (auto &I : make_early_inc_range(BB)) {
@@ -217,13 +210,13 @@ bool FinalLowerGC::runOnFunction(Function &F)
             LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot);
             LOWER_INTRINSIC(pushGCFrame, lowerPushGCFrame);
             LOWER_INTRINSIC(popGCFrame, lowerPopGCFrame);
-            LOWER_INTRINSIC(GCAllocBytes, lowerGCAllocBytes);
             LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot);
             LOWER_INTRINSIC(safepoint, lowerSafepoint);
 
 #undef LOWER_INTRINSIC
         }
     }
+
     return true;
     // Verify that skipping was in fact correct
 verify_skip:
@@ -236,6 +229,12 @@ bool FinalLowerGC::runOnFunction(Function &F)
 
             Value *callee = CI->getCalledOperand();
             assert(callee);
+            if (write_barrier_func == callee) {
+                errs() << "Final-GC-lowering didn't eliminate all write barriers from '" << F.getName() << "', dumping entire module!\n\n";
+                errs() << *F.getParent() << "\n";
+                abort();
+            }
+
             auto IS_INTRINSIC = [&](auto intrinsic) {
                 auto intrinsic2 = getOrNull(intrinsic);
                 if (intrinsic2 == callee) {
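
Both backend files make the same size-class decision when the allocation size is a compile-time constant; only the calls they emit differ. The condensed sketch below is not part of the commit, just a restatement of that dispatch; the enum labels stand in for the bigAllocFunc, smallAllocFunc, and allocTypedFunc callees resolved through jl_well_known.

#include <cstddef>

extern "C" int jl_gc_classify_pools(size_t sz, int *osize);  // same runtime helper the passes call

enum class AllocPath { Big, SmallPool, Typed };

// Sketch of the branch structure inside lowerGCAllocBytes (see the diffs above for the IR actually emitted).
static AllocPath classify_alloc(bool size_is_constant, size_t sz)
{
    if (!size_is_constant)
        return AllocPath::Typed;      // dynamic size: lowered to a call through allocTypedFunc
    int osize;
    if (jl_gc_classify_pools(sz, &osize) < 0)
        return AllocPath::Big;        // no pool fits: bigAllocFunc, with one extra tag word of size
    return AllocPath::SmallPool;      // pool allocation; the MMTk variant also inlines a fast path
}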
