Skip to content

Commit d6271e5

Browse files
committed
fix alignment computation for nested objects
The alignment of a nested object (in C layouts) is not affected by the alignment of its parent container when computing a field offset. This can be strongly counter-intuitive (as it implies adding padding where it does not seem to provide value), but is required to match the C standard. It also permits users to write custom allocators which happen to provide alignment in excess of that which codegen may assume is guaranteed, and get the behavioral characteristics they intended to specify (without resorting to computing explicit padding). Addresses #57713 (comment)
1 parent a97137e commit d6271e5

File tree

4 files changed

+45
-24
lines changed

4 files changed

+45
-24
lines changed

doc/src/manual/calling-c-and-fortran-code.md

+15-9
Original file line numberDiff line numberDiff line change
@@ -547,15 +547,14 @@ is not valid, since the type layout of `T` is not known statically.
547547

548548
### SIMD Values
549549

550-
Note: This feature is currently implemented on 64-bit x86 and AArch64 platforms only.
551-
552550
If a C/C++ routine has an argument or return value that is a native SIMD type, the corresponding
553551
Julia type is a homogeneous tuple of `VecElement` that naturally maps to the SIMD type. Specifically:
554552

555-
> * The tuple must be the same size as the SIMD type. For example, a tuple representing an `__m128`
556-
> on x86 must have a size of 16 bytes.
557-
> * The element type of the tuple must be an instance of `VecElement{T}` where `T` is a primitive type that
558-
> is 1, 2, 4 or 8 bytes.
553+
> * The tuple must be the same size and elements as the SIMD type. For example, a tuple
554+
> representing an `__m128` on x86 must have a size of 16 bytes and Float32 elements.
555+
> * The element type of the tuple must be an instance of `VecElement{T}` where `T` is a
556+
> primitive type with a power-of-two number of bytes (e.g. 1, 2, 4, 8, 16, etc) such as
557+
> Int8 or Float64.
559558
560559
For instance, consider this C routine that uses AVX intrinsics:
561560

@@ -628,6 +627,10 @@ For translating a C argument list to Julia:
628627

629628
* `T`, where `T` is a concrete Julia type
630629
* argument value will be copied (passed by value)
630+
* `vector T` (or `__attribute__ vector_size`, or a typedef such as `__m128`)
631+
632+
* `NTuple{N, VecElement{T}}`, where `T` is a primitive Julia type of the correct size
633+
and N is the number of elements in the vector (equal to `vector_size / sizeof T`).
631634
* `void*`
632635

633636
* depends on how this parameter is used, first translate this to the intended pointer type, then
@@ -674,13 +677,16 @@ For translating a C return type to Julia:
674677
* `T`, where `T` is one of the primitive types: `char`, `int`, `long`, `short`, `float`, `double`,
675678
`complex`, `enum` or any of their `typedef` equivalents
676679

677-
* `T`, where `T` is an equivalent Julia Bits Type (per the table above)
678-
* if `T` is an `enum`, the argument type should be equivalent to `Cint` or `Cuint`
680+
* same as C argument list
679681
* argument value will be copied (returned by-value)
680682
* `struct T` (including typedef to a struct)
681683

682-
* `T`, where `T` is a concrete Julia Type
684+
* same as C argument list
683685
* argument value will be copied (returned by-value)
686+
687+
* `vector T`
688+
689+
* same as C argument list
684690
* `void*`
685691

686692
* depends on how this parameter is used, first translate this to the intended pointer type, then

src/cgutils.cpp

+5-8
Original file line numberDiff line numberDiff line change
@@ -3103,11 +3103,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
31033103
else if (strct.ispointer()) {
31043104
auto tbaa = best_field_tbaa(ctx, strct, jt, idx, byte_offset);
31053105
Value *staddr = data_pointer(ctx, strct);
3106-
Value *addr;
3107-
if (jl_is_vecelement_type((jl_value_t*)jt) || byte_offset == 0)
3108-
addr = staddr; // VecElement types are unwrapped in LLVM.
3109-
else
3110-
addr = emit_ptrgep(ctx, staddr, byte_offset);
3106+
Value *addr = (byte_offset == 0 ? staddr : emit_ptrgep(ctx, staddr, byte_offset));
31113107
if (addr != staddr)
31123108
setNameWithField(ctx.emission_context, addr, get_objname, jt, idx, Twine("_ptr"));
31133109
if (jl_field_isptr(jt, idx)) {
@@ -3571,7 +3567,7 @@ static void union_alloca_type(jl_uniontype_t *ut,
35713567
[&](unsigned idx, jl_datatype_t *jt) {
35723568
if (!jl_is_datatype_singleton(jt)) {
35733569
size_t nb1 = jl_datatype_size(jt);
3574-
size_t align1 = jl_datatype_align(jt);
3570+
size_t align1 = julia_alignment((jl_value_t*)jt);
35753571
if (nb1 > nbytes)
35763572
nbytes = nb1;
35773573
if (align1 > align)
@@ -4133,10 +4129,11 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
41334129

41344130
// choose whether we should perform the initialization with the struct as a IR value
41354131
// or instead initialize the stack buffer with stores (the later is nearly always better)
4132+
// although we do the former if it is a vector or could be a vector element
41364133
auto tracked = split_value_size(sty);
41374134
assert(CountTrackedPointers(lt).count == tracked.second);
41384135
bool init_as_value = false;
4139-
if (lt->isVectorTy() || jl_is_vecelement_type(ty)) { // maybe also check the size ?
4136+
if (lt->isVectorTy() || jl_special_vector_alignment(1, ty) != 0) {
41404137
init_as_value = true;
41414138
}
41424139

@@ -4343,7 +4340,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
43434340
if (strct) {
43444341
jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack);
43454342
promotion_point = ai.decorateInst(ctx.builder.CreateMemSet(strct, ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0),
4346-
jl_datatype_size(ty), MaybeAlign(jl_datatype_align(ty))));
4343+
jl_datatype_size(ty), Align(julia_alignment(ty))));
43474344
}
43484345
ctx.builder.restoreIP(savedIP);
43494346
}

src/datatype.c

+18-7
Original file line numberDiff line numberDiff line change
@@ -300,9 +300,10 @@ static jl_datatype_layout_t *jl_get_layout(uint32_t sz,
300300
}
301301

302302
// Determine if homogeneous tuple with fields of type t will have
303-
// a special alignment beyond normal Julia rules.
303+
// a special alignment and vector-ABI beyond normal rules for aggregates.
304304
// Return special alignment if one exists, 0 if normal alignment rules hold.
305305
// A non-zero result *must* match the LLVM rules for a vector type <nfields x t>.
306+
// Matching the compiler's `__attribute__ vector_size` behavior.
306307
// For sake of Ahead-Of-Time (AOT) compilation, this routine has to work
307308
// without LLVM being available.
308309
unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t)
@@ -317,8 +318,12 @@ unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t)
317318
// motivating use case comes up for Julia, we reject pointers.
318319
return 0;
319320
size_t elsz = jl_datatype_size(ty);
320-
if (elsz != 1 && elsz != 2 && elsz != 4 && elsz != 8)
321-
// Only handle power-of-two-sized elements (for now)
321+
if (next_power_of_two(elsz) != elsz)
322+
// Only handle power-of-two-sized elements (for now), since other
323+
// lengths may be packed into very complicated arrangements (llvm pads
324+
// extra bits on most platforms when computing alignment but not when
325+
// computing type size, but adds no extra bytes for each element, so
326+
// their effect on offsets are never what you may naturally expect).
322327
return 0;
323328
size_t size = nfields * elsz;
324329
// Use natural alignment for this vector: this matches LLVM and clang.
@@ -723,9 +728,9 @@ void jl_compute_field_offsets(jl_datatype_t *st)
723728
}
724729
else {
725730
fsz = sizeof(void*);
726-
if (fsz > MAX_ALIGN)
727-
fsz = MAX_ALIGN;
728731
al = fsz;
732+
if (al > MAX_ALIGN)
733+
al = MAX_ALIGN;
729734
desc[i].isptr = 1;
730735
zeroinit = 1;
731736
npointers++;
@@ -769,8 +774,6 @@ void jl_compute_field_offsets(jl_datatype_t *st)
769774
if (al > alignm)
770775
alignm = al;
771776
}
772-
if (alignm > MAX_ALIGN)
773-
alignm = MAX_ALIGN; // We cannot guarantee alignments over 16 bytes because that's what our heap is aligned as
774777
if (LLT_ALIGN(sz, alignm) > sz) {
775778
haspadding = 1;
776779
sz = LLT_ALIGN(sz, alignm);
@@ -939,6 +942,14 @@ JL_DLLEXPORT jl_datatype_t *jl_new_primitivetype(jl_value_t *name, jl_module_t *
939942
uint32_t nbytes = (nbits + 7) / 8;
940943
uint32_t alignm = next_power_of_two(nbytes);
941944
# if defined(_CPU_X86_) && !defined(_OS_WINDOWS_)
945+
// datalayout strings are often weird: on 64-bit they usually follow fairly simple rules,
946+
// but on x86 32 bit platforms, sometimes 5 to 8 byte types are
947+
// 32-bit aligned even though the MAX_ALIGN (for types 9+ bytes) is 16
948+
// (except for f80 which is align 4 on Mingw, Linux, and BSDs--but align 16 on MSVC and Darwin)
949+
// https://llvm.org/doxygen/ARMTargetMachine_8cpp.html#adb29b487708f0dc2a940345b68649270
950+
// https://llvm.org/doxygen/AArch64TargetMachine_8cpp.html#a003a58caf135efbf7273c5ed84e700d7
951+
// https://llvm.org/doxygen/X86TargetMachine_8cpp.html#aefdbcd6131ef195da070cef7fdaf0532
952+
// 32-bit alignment is weird
942953
if (alignm == 8)
943954
alignm = 4;
944955
# endif

test/core.jl

+7
Original file line numberDiff line numberDiff line change
@@ -5773,6 +5773,13 @@ let ni128 = sizeof(FP128test) ÷ sizeof(Int),
57735773
@test reinterpret(UInt128, arr[2].fp) == expected
57745774
end
57755775

5776+
# make sure VecElement Tuple has the C alignment and ABI for supported types
5777+
primitive type Int24 24 end
5778+
@test Base.datatype_alignment(NTuple{10,VecElement{Int16}}) == 32
5779+
@test Base.datatype_alignment(NTuple{10,VecElement{Int24}}) == 4
5780+
@test Base.datatype_alignment(NTuple{10,VecElement{Int64}}) == 128
5781+
@test Base.datatype_alignment(NTuple{10,VecElement{Int128}}) == 256
5782+
57765783
# issue #21516
57775784
struct T21516
57785785
x::Vector{Float64}

0 commit comments

Comments
 (0)