fix: dynamic apply for x86_64 on macOS/linux

bitwalker · bitwalker · commit 8d0dd934d679 · 2023-02-26T23:54:18.000-05:00
diff --git a/library/rt/src/function/apply/dynamic/asm/dynamic_apply_linux.s b/library/rt/src/function/apply/dynamic/asm/dynamic_apply_linux.s
@@ -10,26 +10,28 @@ __firefly_dynamic_apply:
     # At this point, the following registers are bound:
     #
     #   rdi <- callee
-    #   rsi <- argv
-    #   rdx <- argc
+    #   rsi <- process
+    #   rdx <- argv
+    #   rcx <- argc
     #
     # Save the parent base pointer for when control returns to this call frame.
-    # CFA directives will inform the unwinder to expect %rbp at the bottom of the
+    # CFA directives will inform the unwinder to expect rbp at the bottom of the
     # stack for this frame, so this should be the last value on the stack in the caller
     push rbp
     .cfi_def_cfa_offset 16
     .cfi_offset rbp, -16
     mov  rbp, rsp
     .cfi_def_cfa_register rbp
 
-    # Save our callee and argv pointers, and argc
-    mov    r10, rdi
-    mov    r11, rsi
-    mov    rax, rdx
+    # Save our callee, process and argv pointers, and argc
+    mov  r10, rdi
+    mov  r11, rsi
+    mov  rdi, rdx
+    mov  rax, rcx
 
     # Determine if spills are needed
     # In the common case in which they are not, we perform a tail call
-    cmp  rdx, 7
+    cmp  rcx, 5  # rdi carries process, so at most 5 argv entries fit in registers
     ja .L_dyn_call_spill
     
 .L_dyn_call_no_spill:
@@ -42,9 +44,9 @@ __firefly_dynamic_apply:
 
     # Calculate offset in jump table to block which handles the specific
     # number of registers we have arguments for, then jump to that block
-    lea    rcx, [rip + .L_dyn_call_jt]
-    mov    rcx, [rcx + rdx * 8]
-    jmp    rcx
+    lea    rdx, [rip + .L_dyn_call_jt]
+    mov    rax, [rdx + 8*rax]
+    jmp    rax
 
     # All of these basic blocks perform a tail call. As such,
     # the unwinder will skip over this frame should the callee
@@ -54,39 +56,39 @@ __firefly_dynamic_apply:
     jmp r10
 
 .L_dyn_call_regs1:
-    mov rdi, [r11]
+    mov rsi, [rdi]
     pop rbp
     jmp r10
 
 .L_dyn_call_regs2:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    pop rbp
-    jmp r10
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_regs3:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    mov rdx, [r11 + 16]
-    pop rbp
-    jmp r10
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
+    mov  rcx, [rdi + 16]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_regs4:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    mov rdx, [r11 + 16]
-    mov rcx, [r11 + 24]
-    pop rbp
-    jmp r10
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
+    mov  rcx, [rdi + 16]
+    mov  r8, [rdi + 24]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_regs5:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    mov rdx, [r11 + 16]
-    mov rcx, [r11 + 24]
-    mov r8,  [r11 + 32]
-    pop rbp
-    jmp r10
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
+    mov  rcx, [rdi + 16]
+    mov  r8, [rdi + 24]
+    mov  r9, [rdi + 32]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_regs6:
     mov rdi, [r11]
@@ -105,7 +107,7 @@ __firefly_dynamic_apply:
 
     # Calculate spill count for later (rep uses rcx for the iteration count,
     # which in this case is the number of quadwords to copy)
-    mov  rcx, rdx
+    mov  r8, rcx
     sub  rcx, 6
 
     # Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -116,16 +118,19 @@ __firefly_dynamic_apply:
     sub rsp, rax
 
     # load source pointer (last item of argv)
-    lea rsi, [r11 + rdx * 8 + -8]
+    lea rsi, [rdi + r8 * 8 - 8]
     # load destination pointer (top of spill region)
-    lea rdi, [rsp + rcx * 8 + -8]
+    lea rdi, [rsp + rcx * 8 - 8]
     # copy rcx quadwords from rsi to rdi, in reverse
     std
     rep movsq
     cld
 
     # We've spilled arguments, so we have at least 6 args
-    mov  rdi, [r11]
+    mov  r8,  rdi # Intended: stash argv in r8 so it can be swapped into r11. NOTE(review): rdi was repointed at the spill region by the lea/rep movsq above, so this is no longer argv - confirm
+    mov  rdi, r11 # Move process pointer to rdi
+    mov  r11, r8  # Move r8 to r11
+    mov  rsi, [r11] # NOTE(review): overwritten by the rsi load on the next line, so argv[0] is never passed
     mov  rsi, [r11 + 8]
     mov  rdx, [r11 + 16]
     mov  rcx, [r11 + 24]
diff --git a/library/rt/src/function/apply/dynamic/asm/dynamic_apply_macos.s b/library/rt/src/function/apply/dynamic/asm/dynamic_apply_macos.s
@@ -8,8 +8,9 @@ L_dyn_call_begin:
     # At this point, the following registers are bound:
     #
     #   rdi <- callee
-    #   rsi <- argv
-    #   rdx <- argc
+    #   rsi <- process
+    #   rdx <- argv
+    #   rcx <- argc
     #
     # Save the parent base pointer for when control returns to this call frame.
     # CFA directives will inform the unwinder to expect rbp at the bottom of the
@@ -20,14 +21,15 @@ L_dyn_call_begin:
     mov  rbp, rsp
     .cfi_def_cfa_register rbp
 
-    # Save our callee and argv pointers, and argc
+    # Save our callee, process and argv pointers, and argc
     mov  r10, rdi
     mov  r11, rsi
-    mov  rax, rdx
+    mov  rdi, rdx
+    mov  rax, rcx
 
-    # Determine if spills are needed
+    # Determine if spills are needed (process + argc must fit in the 6 argument registers)
     # In the common case in which they are not, we perform a tail call
-    cmp  rdx, 7
+    cmp  rcx, 5
     ja L_dyn_call_spill
 
 L_dyn_call_no_spill:
@@ -38,62 +40,52 @@ L_dyn_call_no_spill:
 
     # Calculate offset in jump table to block which handles the specific
     # number of registers we have arguments for, then jump to that block
-    lea  rcx, [rip + L_dyn_call_jt]
-    mov  rax, [rcx + rax * 4]
-    add  rax, rcx
-    jmp  [rax]
+    lea  rdx, [rip + L_dyn_call_jt]
+    movsxd  rax, dword ptr [rdx + 4*rax]
+    add  rax, rdx
+    jmp  rax
 
     # All of these basic blocks perform a tail call. As such,
     # the unwinder will skip over this frame should the callee
     # throw an exception
 L_dyn_call_regs0:
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs1:
-    mov  rdi, [r11]
+    mov  rsi, [rdi]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs2:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs3:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
+    mov  rcx, [rdi + 16]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs4:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
+    mov  rcx, [rdi + 16]
+    mov  r8, [rdi + 24]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs5:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
-    mov  r8, [r11 + 32]
+    mov  rsi, [rdi]
+    mov  rdx, [rdi + 8]
+    mov  rcx, [rdi + 16]
+    mov  r8, [rdi + 24]
+    mov  r9, [rdi + 32]
     pop  rbp
-    jmp  [r10]
-
-L_dyn_call_regs6:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
-    mov  r8, [r11 + 32]
-    mov  r9, [r11 + 40]
-    pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_spill:
     # If we hit this block, we have identified that there are
@@ -102,7 +94,7 @@ L_dyn_call_spill:
 
     # Calculate spill count for later (rep uses rcx for the iteration count,
     # which in this case is the number of quadwords to copy)
-    mov  rcx, rdx
-    sub  rcx, 6
+    mov  r8, rcx
+    sub  rcx, 5  # rdi carries process, so only argv[0..4] travel in registers
 
     # Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -113,21 +105,24 @@ L_dyn_call_spill:
     sub  rsp, rax
 
     # load source pointer (last item of argv)
-    lea  rsi, [r11 + rdx * 8 - 8]
+    lea  rsi, [rdi + r8 * 8 - 8]
+    # stash argv in r9: rdi is about to become the copy destination
+    mov  r9, rdi
     # load destination pointer (top of spill region)
-    lea rdi, [rsp + rcx * 8 - 8]
+    lea  rdi, [rsp + rcx * 8 - 8]
     # copy rcx quadwords from rsi to rdi, in reverse
     std
     rep  movsq
     cld
 
-    # We've spilled arguments, so we have at least 6 args
-    mov  rdi,   [r11]
-    mov  rsi,  [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
-    mov  r8, [r11 + 32]
-    mov  r9, [r11 + 40]
+    # argv[5..] are now on the stack; load process plus argv[0..4] into
+    # the six argument registers (r9 is loaded last because it holds argv)
+    mov  rdi, r11
+    mov  rsi, [r9]
+    mov  rdx, [r9 + 8]
+    mov  rcx, [r9 + 16]
+    mov  r8,  [r9 + 24]
+    mov  r9,  [r9 + 32]
 
 L_dyn_call_exec:
     # If we spill arguments to the stack, we can't perform
@@ -141,7 +136,7 @@ L_dyn_call_exec:
     # This instruction will push the return address and jump,
     # and we can expect rbp to be the same as we left it upon
     # return.
-    call  [r10]
+    call  r10
 
 L_dyn_call_ret:
     # Non-tail call completed successfully
@@ -156,21 +151,19 @@ L_dyn_call_end:
     # a variable number of register-based arguments
     .p2align 2
     .data_region jt32
-    .set L_dyn_call_jt_entry0, L_dyn_call_exec-L_dyn_call_jt
+    .set L_dyn_call_jt_entry0, L_dyn_call_regs0-L_dyn_call_jt
     .set L_dyn_call_jt_entry1, L_dyn_call_regs1-L_dyn_call_jt
     .set L_dyn_call_jt_entry2, L_dyn_call_regs2-L_dyn_call_jt
     .set L_dyn_call_jt_entry3, L_dyn_call_regs3-L_dyn_call_jt
     .set L_dyn_call_jt_entry4, L_dyn_call_regs4-L_dyn_call_jt
     .set L_dyn_call_jt_entry5, L_dyn_call_regs5-L_dyn_call_jt
-    .set L_dyn_call_jt_entry6, L_dyn_call_regs6-L_dyn_call_jt
 L_dyn_call_jt:
     .long L_dyn_call_jt_entry0
     .long L_dyn_call_jt_entry1
     .long L_dyn_call_jt_entry2
     .long L_dyn_call_jt_entry3
     .long L_dyn_call_jt_entry4
     .long L_dyn_call_jt_entry5
-    .long L_dyn_call_jt_entry6
     .end_data_region
 
     # The following is the LSDA metadata for exception handling