Skip to content
This repository was archived by the owner on Jun 10, 2024. It is now read-only.

Commit 8d0dd93

Browse files
committed
fix: dynamic apply for x86_64 on macOS/linux
1 parent 8b87eba commit 8d0dd93

File tree

2 files changed

+88
-90
lines changed

2 files changed

+88
-90
lines changed

library/rt/src/function/apply/dynamic/asm/dynamic_apply_linux.s

+43 -38
Original file line number | Diff line number | Diff line change
@@ -10,26 +10,28 @@ __firefly_dynamic_apply:
1010
# At this point, the following registers are bound:
1111
#
1212
# rdi <- callee
13-
# rsi <- argv
14-
# rdx <- argc
13+
# rsi <- process
14+
# rdx <- argv
15+
# rcx <- argc
1516
#
1617
# Save the parent base pointer for when control returns to this call frame.
17-
# CFA directives will inform the unwinder to expect %rbp at the bottom of the
18+
# CFA directives will inform the unwinder to expect rbp at the bottom of the
1819
# stack for this frame, so this should be the last value on the stack in the caller
1920
push rbp
2021
.cfi_def_cfa_offset 16
2122
.cfi_offset rbp, -16
2223
mov rbp, rsp
2324
.cfi_def_cfa_register rbp
2425

25-
# Save our callee and argv pointers, and argc
26-
mov r10, rdi
27-
mov r11, rsi
28-
mov rax, rdx
26+
# Save our callee, process and argv pointers, and argc
27+
mov r10, rdi
28+
mov r11, rsi
29+
mov rdi, rdx
30+
mov rax, rcx
2931

3032
# Determine if spills are needed
3133
# In the common case in which they are not, we perform a tail call
32-
cmp rdx, 7
34+
cmp rcx, 6
3335
ja .L_dyn_call_spill
3436

3537
.L_dyn_call_no_spill:
@@ -42,9 +44,9 @@ __firefly_dynamic_apply:
4244

4345
# Calculate offset in jump table to block which handles the specific
4446
# number of registers we have arguments for, then jump to that block
45-
lea rcx, [rip + .L_dyn_call_jt]
46-
mov rcx, [rcx + rdx * 8]
47-
jmp rcx
47+
lea rdx, [rip + .L_dyn_call_jt]
48+
mov rax, [rdx + 8*rax]
49+
jmp rax
4850

4951
# All of these basic blocks perform a tail call. As such,
5052
# the unwinder will skip over this frame should the callee
@@ -54,39 +56,39 @@ __firefly_dynamic_apply:
5456
jmp r10
5557

5658
.L_dyn_call_regs1:
57-
mov rdi, [r11]
59+
mov rsi, [rdi]
5860
pop rbp
5961
jmp r10
6062

6163
.L_dyn_call_regs2:
62-
mov rdi, [r11]
63-
mov rsi, [r11 + 8]
64-
pop rbp
65-
jmp r10
64+
mov rsi, [rdi]
65+
mov rdx, [rdi + 8]
66+
pop rbp
67+
jmp r10
6668

6769
.L_dyn_call_regs3:
68-
mov rdi, [r11]
69-
mov rsi, [r11 + 8]
70-
mov rdx, [r11 + 16]
71-
pop rbp
72-
jmp r10
70+
mov rsi, [rdi]
71+
mov rdx, [rdi + 8]
72+
mov rcx, [rdi + 16]
73+
pop rbp
74+
jmp r10
7375

7476
.L_dyn_call_regs4:
75-
mov rdi, [r11]
76-
mov rsi, [r11 + 8]
77-
mov rdx, [r11 + 16]
78-
mov rcx, [r11 + 24]
79-
pop rbp
80-
jmp r10
77+
mov rsi, [rdi]
78+
mov rdx, [rdi + 8]
79+
mov rcx, [rdi + 16]
80+
mov r8, [rdi + 24]
81+
pop rbp
82+
jmp r10
8183

8284
.L_dyn_call_regs5:
83-
mov rdi, [r11]
84-
mov rsi, [r11 + 8]
85-
mov rdx, [r11 + 16]
86-
mov rcx, [r11 + 24]
87-
mov r8, [r11 + 32]
88-
pop rbp
89-
jmp r10
85+
mov rsi, [rdi]
86+
mov rdx, [rdi + 8]
87+
mov rcx, [rdi + 16]
88+
mov r8, [rdi + 24]
89+
mov r9, [rdi + 32]
90+
pop rbp
91+
jmp r10
9092

9193
.L_dyn_call_regs6:
9294
mov rdi, [r11]
@@ -105,7 +107,7 @@ __firefly_dynamic_apply:
105107

106108
# Calculate spill count for later (rep uses rcx for the iteration count,
107109
# which in this case is the number of quadwords to copy)
108-
mov rcx, rdx
110+
mov r8, rcx
109111
sub rcx, 6
110112

111113
# Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -116,16 +118,19 @@ __firefly_dynamic_apply:
116118
sub rsp, rax
117119

118120
# load source pointer (last item of argv)
119-
lea rsi, [r11 + rdx * 8 + -8]
121+
lea rsi, [rdi + r8 * 8 - 8]
120122
# load destination pointer (top of spill region)
121-
lea rdi, [rsp + rcx * 8 + -8]
123+
lea rdi, [rsp + rcx * 8 - 8]
122124
# copy rcx quadwords from rsi to rdi, in reverse
123125
std
124126
rep movsq
125127
cld
126128

127129
# We've spilled arguments, so we have at least 6 args
128-
mov rdi, [r11]
130+
mov r8, rdi # We need to move rdi to r11, but it is occupied, so temporarily move to r8
131+
mov rdi, r11 # Move process pointer to rdi
132+
mov r11, r8 # Move r8 to r11
133+
mov rsi, [r11]
129134
mov rsi, [r11 + 8]
130135
mov rdx, [r11 + 16]
131136
mov rcx, [r11 + 24]

library/rt/src/function/apply/dynamic/asm/dynamic_apply_macos.s

+45 -52
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,9 @@ L_dyn_call_begin:
88
# At this point, the following registers are bound:
99
#
1010
# rdi <- callee
11-
# rsi <- argv
12-
# rdx <- argc
11+
# rsi <- process
12+
# rdx <- argv
13+
# rcx <- argc
1314
#
1415
# Save the parent base pointer for when control returns to this call frame.
1516
# CFA directives will inform the unwinder to expect rbp at the bottom of the
@@ -20,14 +21,15 @@ L_dyn_call_begin:
2021
mov rbp, rsp
2122
.cfi_def_cfa_register rbp
2223

23-
# Save our callee and argv pointers, and argc
24+
# Save our callee, process and argv pointers, and argc
2425
mov r10, rdi
2526
mov r11, rsi
26-
mov rax, rdx
27+
mov rdi, rdx
28+
mov rax, rcx
2729

28-
# Determine if spills are needed
30+
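# Determine if spills are needed (the check below takes the spill path only when argc > 6)
# NOTE(review): the jump table below only has entries for 0-5 arguments - confirm argc == 6 is handled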
2931
# In the common case in which they are not, we perform a tail call
30-
cmp rdx, 7
32+
cmp rcx, 6
3133
ja L_dyn_call_spill
3234

3335
L_dyn_call_no_spill:
@@ -38,62 +40,52 @@ L_dyn_call_no_spill:
3840

3941
# Calculate offset in jump table to block which handles the specific
4042
# number of registers we have arguments for, then jump to that block
41-
lea rcx, [rip + L_dyn_call_jt]
42-
mov rax, [rcx + rax * 4]
43-
add rax, rcx
44-
jmp [rax]
43+
lea rdx, [rip + L_dyn_call_jt]
44+
movsxd rax, dword ptr [rdx + 4*rax]
45+
add rax, rdx
46+
jmp rax
4547

4648
# All of these basic blocks perform a tail call. As such,
4749
# the unwinder will skip over this frame should the callee
4850
# throw an exception
4951
L_dyn_call_regs0:
5052
pop rbp
51-
jmp [r10]
53+
jmp r10
5254

5355
L_dyn_call_regs1:
54-
mov rdi, [r11]
56+
mov rsi, [rdi]
5557
pop rbp
56-
jmp [r10]
58+
jmp r10
5759

5860
L_dyn_call_regs2:
59-
mov rdi, [r11]
60-
mov rsi, [r11 + 8]
61+
mov rsi, [rdi]
62+
mov rdx, [rdi + 8]
6163
pop rbp
62-
jmp [r10]
64+
jmp r10
6365

6466
L_dyn_call_regs3:
65-
mov rdi, [r11]
66-
mov rsi, [r11 + 8]
67-
mov rdx, [r11 + 16]
67+
mov rsi, [rdi]
68+
mov rdx, [rdi + 8]
69+
mov rcx, [rdi + 16]
6870
pop rbp
69-
jmp [r10]
71+
jmp r10
7072

7173
L_dyn_call_regs4:
72-
mov rdi, [r11]
73-
mov rsi, [r11 + 8]
74-
mov rdx, [r11 + 16]
75-
mov rcx, [r11 + 24]
74+
mov rsi, [rdi]
75+
mov rdx, [rdi + 8]
76+
mov rcx, [rdi + 16]
77+
mov r8, [rdi + 24]
7678
pop rbp
77-
jmp [r10]
79+
jmp r10
7880

7981
L_dyn_call_regs5:
80-
mov rdi, [r11]
81-
mov rsi, [r11 + 8]
82-
mov rdx, [r11 + 16]
83-
mov rcx, [r11 + 24]
84-
mov r8, [r11 + 32]
82+
mov rsi, [rdi]
83+
mov rdx, [rdi + 8]
84+
mov rcx, [rdi + 16]
85+
mov r8, [rdi + 24]
86+
mov r9, [rdi + 32]
8587
pop rbp
86-
jmp [r10]
87-
88-
L_dyn_call_regs6:
89-
mov rdi, [r11]
90-
mov rsi, [r11 + 8]
91-
mov rdx, [r11 + 16]
92-
mov rcx, [r11 + 24]
93-
mov r8, [r11 + 32]
94-
mov r9, [r11 + 40]
95-
pop rbp
96-
jmp [r10]
88+
jmp r10
9789

9890
L_dyn_call_spill:
9991
# If we hit this block, we have identified that there are
@@ -102,7 +94,7 @@ L_dyn_call_spill:
10294

10395
# Calculate spill count for later (rep uses rcx for the iteration count,
10496
# which in this case is the number of quadwords to copy)
105-
mov rcx, rdx
97+
mov r8, rcx
10698
sub rcx, 6
10799

108100
# Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -113,21 +105,24 @@ L_dyn_call_spill:
113105
sub rsp, rax
114106

115107
# load source pointer (last item of argv)
116-
lea rsi, [r11 + rdx * 8 - 8]
108+
lea rsi, [rdi + r8 * 8 - 8]
117109
# load destination pointer (top of spill region)
118-
lea rdi, [rsp + rcx * 8 - 8]
110+
lea rdi, [rsp + rcx * 8 - 8]
119111
# copy rcx quadwords from rsi to rdi, in reverse
120112
std
121113
rep movsq
122114
cld
123115

124116
# We've spilled arguments, so we have at least 6 args
125-
mov rdi, [r11]
126-
mov rsi, [r11 + 8]
117+
mov r8, rdi # We need to move rdi to r11, but it is occupied, so temporarily move to r8
118+
mov rdi, r11 # Move process pointer to rdi
119+
mov r11, r8 # Move r8 to r11
120+
mov rsi, [r11]
121+
mov rsi, [r11 + 8]
127122
mov rdx, [r11 + 16]
128123
mov rcx, [r11 + 24]
129-
mov r8, [r11 + 32]
130-
mov r9, [r11 + 40]
124+
mov r8, [r11 + 32]
125+
mov r9, [r11 + 40]
131126

132127
L_dyn_call_exec:
133128
# If we spill arguments to the stack, we can't perform
@@ -141,7 +136,7 @@ L_dyn_call_exec:
141136
# This instruction will push the return address and jump,
142137
# and we can expect rbp to be the same as we left it upon
143138
# return.
144-
call [r10]
139+
call r10
145140

146141
L_dyn_call_ret:
147142
# Non-tail call completed successfully
@@ -156,21 +151,19 @@ L_dyn_call_end:
156151
# a variable number of register-based arguments
157152
.p2align 2
158153
.data_region jt32
159-
.set L_dyn_call_jt_entry0, L_dyn_call_exec-L_dyn_call_jt
154+
.set L_dyn_call_jt_entry0, L_dyn_call_regs0-L_dyn_call_jt
160155
.set L_dyn_call_jt_entry1, L_dyn_call_regs1-L_dyn_call_jt
161156
.set L_dyn_call_jt_entry2, L_dyn_call_regs2-L_dyn_call_jt
162157
.set L_dyn_call_jt_entry3, L_dyn_call_regs3-L_dyn_call_jt
163158
.set L_dyn_call_jt_entry4, L_dyn_call_regs4-L_dyn_call_jt
164159
.set L_dyn_call_jt_entry5, L_dyn_call_regs5-L_dyn_call_jt
165-
.set L_dyn_call_jt_entry6, L_dyn_call_regs6-L_dyn_call_jt
166160
L_dyn_call_jt:
167161
.long L_dyn_call_jt_entry0
168162
.long L_dyn_call_jt_entry1
169163
.long L_dyn_call_jt_entry2
170164
.long L_dyn_call_jt_entry3
171165
.long L_dyn_call_jt_entry4
172166
.long L_dyn_call_jt_entry5
173-
.long L_dyn_call_jt_entry6
174167
.end_data_region
175168

176169
# The following is the LSDA metadata for exception handling

0 commit comments

Comments
 (0)