Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
return Is.size() <= 2;
}

// Check if a COPY instruction is cheap.
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
assert(MI.isCopy() && "Expected COPY instruction");
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

// Cross-register-class copies (e.g., between GPR and FPR) are expensive on
// AArch64, typically requiring an FMOV instruction with a 2-6 cycle latency.
auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
if (Reg.isVirtual())
return MRI.getRegClass(Reg);
if (Reg.isPhysical())
return RI.getMinimalPhysRegClass(Reg);
return nullptr;
};
const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
return false;

return MI.isAsCheapAsAMove();
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
Expand All @@ -1056,6 +1078,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
default:
return MI.isAsCheapAsAMove();

case TargetOpcode::COPY:
return isCheapCopy(MI, RI);

case AArch64::ADDWrs:
case AArch64::ADDXrs:
case AArch64::SUBWrs:
Expand Down
141 changes: 141 additions & 0 deletions llvm/test/CodeGen/AArch64/licm-regclass-copy.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s

# This test verifies that cross-register-class copies (e.g., GPR to FPR)
# are hoisted out of loops by MachineLICM, as they are expensive on AArch64.

--- |
declare void @use_float(float)

define void @cross_regclass_copy_hoisted() {
ret void
}

define void @cross_regclass_physical_copy_hoisted() {
ret void
}
...
---
name: cross_regclass_copy_hoisted
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: cross_regclass_copy_hoisted
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $w0, $w1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2
; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
; CHECK-NEXT: B %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: $s0 = COPY [[COPY4]]
; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
; CHECK-NEXT: B %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: RET_ReallyLR
bb.0:
liveins: $w0, $w1
%1:gpr32 = COPY $w0
%0:gpr32 = COPY $w1
%3:gpr32all = COPY $wzr
%2:gpr32all = COPY %3:gpr32all

bb.1:
%4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
%6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
Bcc 1, %bb.3, implicit $nzcv
B %bb.2

bb.2:
%7:fpr32 = COPY %0:gpr32
ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can drop the stack adjustments

$s0 = COPY %7:fpr32
BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
%8:gpr32sp = ADDWri %4:gpr32common, 1, 0
%5:gpr32all = COPY %8:gpr32sp
B %bb.1

bb.3:
RET_ReallyLR

...
---
name: cross_regclass_physical_copy_hoisted
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: cross_regclass_physical_copy_hoisted
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY $wzr
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[COPY1]]
; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY $wzr
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY2]], %bb.0, %4, %bb.2
; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
; CHECK-NEXT: B %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: $s0 = COPY [[COPY3]]
; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
; CHECK-NEXT: B %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: RET_ReallyLR
bb.0:
liveins: $w0
%1:gpr32 = COPY $w0
%3:gpr32all = COPY $wzr
%2:gpr32all = COPY %3:gpr32all

bb.1:
%4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
%6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
Bcc 1, %bb.3, implicit $nzcv
B %bb.2

bb.2:
%7:fpr32 = COPY $wzr
ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
$s0 = COPY %7:fpr32
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This copy won't get hoisted because it gets clobbered by the call. Could you also add another copy to another physical register that will get hoisted?

AFAICT both tests hoist copies from integer -> FP registers; could you also add hoistable copies from FP -> integer registers?

BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
%8:gpr32sp = ADDWri %4:gpr32common, 1, 0
%5:gpr32all = COPY %8:gpr32sp
B %bb.1

bb.3:
RET_ReallyLR

...
Loading