Description
I've been struggling for a couple days now on why my CHIP-8 firmware doesn't work anymore, even when downgrading to the exact same rustc/llvm versions as before. Then I found out that it's not because of anything in the compiler, but it's because in the meantime I switched over to using Xargo for linking instead of a custom shell script.
The following program should demonstrate the problem. It is the result of compiling a switch, not unlike the one in #47. The stripped-down IR is as follows:
target datalayout = "e-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"
target triple = "avr-atmel-none"
@switch.table = private unnamed_addr constant [4 x i8] c"\01\02\03\0C"
@switch.table.1 = private unnamed_addr constant [4 x i8] c"\04\05\06\0D"
@switch.table.2 = private unnamed_addr constant [4 x i8] c"\07\08\09\0E"
@switch.table.3 = private unnamed_addr constant [4 x i8] c"\0A\00\0B\0F"
; Function Attrs: noinline norecurse nounwind uwtable
define internal fastcc void @spi_setup() {
start:
%0 = load volatile i8, i8* inttoptr (i16 37 to i8*), align 1
%1 = or i8 %0, 4
store volatile i8 %1, i8* inttoptr (i16 37 to i8*), align 1
%2 = load volatile i8, i8* inttoptr (i16 36 to i8*), align 4
%3 = or i8 %2, 44
store volatile i8 %3, i8* inttoptr (i16 36 to i8*), align 4
%4 = load volatile i8, i8* inttoptr (i16 36 to i8*), align 4
%5 = and i8 %4, -17
store volatile i8 %5, i8* inttoptr (i16 36 to i8*), align 4
store volatile i8 80, i8* inttoptr (i16 76 to i8*), align 4
ret void
}
; Function Attrs: noinline norecurse nounwind uwtable
define internal fastcc void @spi_sync(i8) unnamed_addr #1 {
start:
store volatile i8 %0, i8* inttoptr (i16 78 to i8*), align 2
br label %bb2
bb2: ; preds = %bb2, %start
%1 = load volatile i8, i8* inttoptr (i16 77 to i8*), align 1
%2 = icmp sgt i8 %1, -1
br i1 %2, label %bb2, label %bb3
bb3: ; preds = %bb2
%3 = load volatile i8, i8* inttoptr (i16 78 to i8*), align 2
ret void
}
; Function Attrs: noinline uwtable
define internal fastcc i8 @coords_key(i16) unnamed_addr #2 {
start:
%abi_cast.sroa.0.0.extract.trunc = trunc i16 %0 to i8
%abi_cast.sroa.4.0.extract.shift = lshr i16 %0, 8
%abi_cast.sroa.4.0.extract.trunc = trunc i16 %abi_cast.sroa.4.0.extract.shift to i8
switch i8 %abi_cast.sroa.0.0.extract.trunc, label %bb17 [
i8 0, label %bb18
i8 1, label %bb19
i8 2, label %bb20
i8 3, label %bb21
]
bb17: ; preds = %bb21, %bb20, %bb19,
unreachable
bb18: ; preds = %start
%1 = icmp ult i8 %abi_cast.sroa.4.0.extract.trunc, 4
br i1 %1, label %switch.lookup, label %bb17
bb19: ; preds = %start
%2 = icmp ult i8 %abi_cast.sroa.4.0.extract.trunc, 4
br i1 %2, label %switch.lookup1, label %bb17
bb20: ; preds = %start
%3 = icmp ult i8 %abi_cast.sroa.4.0.extract.trunc, 4
br i1 %3, label %switch.lookup5, label %bb17
bb21: ; preds = %start
%4 = icmp ult i8 %abi_cast.sroa.4.0.extract.trunc, 4
br i1 %4, label %switch.lookup9, label %bb17
switch.lookup: ; preds = %bb18
%5 = sext i8 %abi_cast.sroa.4.0.extract.trunc to i16
%switch.gep = getelementptr inbounds [4 x i8], [4 x i8]* @switch.table, i16 0, i16 %5
%switch.load = load i8, i8* %switch.gep, align 1
ret i8 %switch.load
switch.lookup1: ; preds = %bb19
%6 = sext i8 %abi_cast.sroa.4.0.extract.trunc to i16
%switch.gep3 = getelementptr inbounds [4 x i8], [4 x i8]* @switch.table.1, i16 0, i16 %6
%switch.load4 = load i8, i8* %switch.gep3, align 1
ret i8 %switch.load4
switch.lookup5: ; preds = %bb20
%7 = sext i8 %abi_cast.sroa.4.0.extract.trunc to i16
%switch.gep7 = getelementptr inbounds [4 x i8], [4 x i8]* @switch.table.2, i16 0, i16 %7
%switch.load8 = load i8, i8* %switch.gep7, align 1
ret i8 %switch.load8
switch.lookup9: ; preds = %bb21
%8 = sext i8 %abi_cast.sroa.4.0.extract.trunc to i16
%switch.gep11 = getelementptr inbounds [4 x i8], [4 x i8]* @switch.table.3, i16 0, i16 %8
%switch.load12 = load i8, i8* %switch.gep11, align 1
ret i8 %switch.load12
}
; Function Attrs: noinline nounwind uwtable
define internal fastcc i16 @wait_key() {
ret i16 257 ; 0x0101
}
; Function Attrs: noreturn nounwind uwtable
define void @main() unnamed_addr #4 {
start:
tail call fastcc void @spi_setup() #7
%0 = tail call fastcc i16 @wait_key() #7
%1 = tail call fastcc i8 @coords_key(i16 %0) #7
tail call fastcc void @spi_sync(i8 %1) #7
ret void
}
I can compile this on f1e3b63, i.e. with all the horrible kludges that force lpm
to be used for loading globals, and the resulting assembly of coords_keys
looks fine:
0000001c <coords_key>:
1c: 81 30 cpi r24, 0x01 ; 1
1e: 01 f0 breq .+0 ; 0x20 <coords_key+0x4>
1e: R_AVR_7_PCREL .text+0x54
20: 82 30 cpi r24, 0x02 ; 2
22: 01 f0 breq .+0 ; 0x24 <coords_key+0x8>
22: R_AVR_7_PCREL .text+0x40
24: 83 30 cpi r24, 0x03 ; 3
26: 01 f0 breq .+0 ; 0x28 <coords_key+0xc>
26: R_AVR_7_PCREL .text+0x2a
28: 00 c0 rjmp .+0 ; 0x2a <LBB2_3>
28: R_AVR_13_PCREL .text+0x68
0000002a <LBB2_3>:
2a: 94 30 cpi r25, 0x04 ; 4
2c: 00 f0 brcs .+0 ; 0x2e <LBB2_3+0x4>
2c: R_AVR_7_PCREL .text+0x30
2e: 00 c0 rjmp .+0 ; 0x30 <LBB2_4>
2e: R_AVR_13_PCREL .text+0x7c
00000030 <LBB2_4>:
30: e9 2f mov r30, r25
32: f9 2f mov r31, r25
34: ff 0f add r31, r31
36: ff 0b sbc r31, r31
38: e0 50 subi r30, 0x00 ; 0
38: R_AVR_LO8_LDI_NEG .rodata.cst4+0xc
3a: f0 40 sbci r31, 0x00 ; 0
3a: R_AVR_HI8_LDI_NEG .rodata.cst4+0xc
3c: 84 91 lpm r24, Z
3e: 08 95 ret
00000040 <LBB2_5>:
40: 94 30 cpi r25, 0x04 ; 4
42: 00 f4 brcc .+0 ; 0x44 <LBB2_5+0x4>
42: R_AVR_7_PCREL .text+0x7c
44: e9 2f mov r30, r25
46: f9 2f mov r31, r25
48: ff 0f add r31, r31
4a: ff 0b sbc r31, r31
4c: e0 50 subi r30, 0x00 ; 0
4c: R_AVR_LO8_LDI_NEG .rodata.cst4+0x8
4e: f0 40 sbci r31, 0x00 ; 0
4e: R_AVR_HI8_LDI_NEG .rodata.cst4+0x8
50: 84 91 lpm r24, Z
52: 08 95 ret
00000054 <LBB2_7>:
54: 94 30 cpi r25, 0x04 ; 4
56: 00 f4 brcc .+0 ; 0x58 <LBB2_7+0x4>
56: R_AVR_7_PCREL .text+0x7c
58: e9 2f mov r30, r25
5a: f9 2f mov r31, r25
5c: ff 0f add r31, r31
5e: ff 0b sbc r31, r31
60: e0 50 subi r30, 0x00 ; 0
60: R_AVR_LO8_LDI_NEG .rodata.cst4+0x4
62: f0 40 sbci r31, 0x00 ; 0
62: R_AVR_HI8_LDI_NEG .rodata.cst4+0x4
64: 84 91 lpm r24, Z
66: 08 95 ret
00000068 <LBB2_9>:
68: 94 30 cpi r25, 0x04 ; 4
6a: 00 f4 brcc .+0 ; 0x6c <LBB2_9+0x4>
6a: R_AVR_7_PCREL .text+0x7c
6c: e9 2f mov r30, r25
6e: f9 2f mov r31, r25
70: ff 0f add r31, r31
72: ff 0b sbc r31, r31
74: e0 50 subi r30, 0x00 ; 0
74: R_AVR_LO8_LDI_NEG .rodata.cst4
76: f0 40 sbci r31, 0x00 ; 0
76: R_AVR_HI8_LDI_NEG .rodata.cst4
78: 84 91 lpm r24, Z
7a: 08 95 ret
In fact, the above code is correct in that I can link it with a custom linker script and get a working program:
$ llc --filetype=obj a.ll
$ avr-gcc -Os -Wl,--gc-sections -T lookup-text.ld -mmcu=atmega328p -o good.elf a.o
$ avr-objdump --syms good.elf |grep switch
00000118 l O .text 00000004 switch.table
0000011c l O .text 00000004 switch.table.1
00000120 l O .text 00000004 switch.table.2
00000124 l O .text 00000004 switch.table.3
but with the default linker script, the lookup tables end up in .data
:
$ avr-gcc -Os -Wl,--gc-sections -mmcu=atmega328p -o bad.elf a.o
$ avr-objdump --syms bad.elf |grep switch
00800100 l O .data 00000004 switch.table
00800104 l O .data 00000004 switch.table.1
00800108 l O .data 00000004 switch.table.2
0080010c l O .data 00000004 switch.table.3
The customized linker script lookup-text.ld
is as follows:
OUTPUT_FORMAT("elf32-avr","elf32-avr","elf32-avr")
OUTPUT_ARCH(avr:2)
MEMORY
{
text (rx) : ORIGIN = 0, LENGTH = 8K
data (rw!x) : ORIGIN = 0x800060, LENGTH = 0xffa0
eeprom (rw!x) : ORIGIN = 0x810000, LENGTH = 64K
fuse (rw!x) : ORIGIN = 0x820000, LENGTH = 1K
lock (rw!x) : ORIGIN = 0x830000, LENGTH = 1K
signature (rw!x) : ORIGIN = 0x840000, LENGTH = 1K
user_signatures (rw!x) : ORIGIN = 0x850000, LENGTH = 1K
}
SECTIONS
{
/* Read-only sections, merged into text segment: */
.hash : { *(.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.rel.init : { *(.rel.init) }
.rela.init : { *(.rela.init) }
.rel.text :
{
*(.rel.text)
*(.rel.text.*)
*(.rel.gnu.linkonce.t*)
}
.rela.text :
{
*(.rela.text)
*(.rela.text.*)
*(.rela.gnu.linkonce.t*)
}
.rel.fini : { *(.rel.fini) }
.rela.fini : { *(.rela.fini) }
.rel.rodata :
{
*(.rel.rodata)
*(.rel.rodata.*)
*(.rel.gnu.linkonce.r*)
}
.rela.rodata :
{
*(.rela.rodata)
*(.rela.rodata.*)
*(.rela.gnu.linkonce.r*)
}
.rel.data :
{
*(.rel.data)
*(.rel.data.*)
*(.rel.gnu.linkonce.d*)
}
.rela.data :
{
*(.rela.data)
*(.rela.data.*)
*(.rela.gnu.linkonce.d*)
}
.rel.ctors : { *(.rel.ctors) }
.rela.ctors : { *(.rela.ctors) }
.rel.dtors : { *(.rel.dtors) }
.rela.dtors : { *(.rela.dtors) }
.rel.got : { *(.rel.got) }
.rela.got : { *(.rela.got) }
.rel.bss : { *(.rel.bss) }
.rela.bss : { *(.rela.bss) }
.rel.plt : { *(.rel.plt) }
.rela.plt : { *(.rela.plt) }
/* Internal text space or external memory. */
.text :
{
*(.vectors)
KEEP(*(.vectors))
/* For data that needs to reside in the lower 64k of progmem. */
*(.progmem.gcc*)
*(.progmem*)
. = ALIGN(2);
__trampolines_start = . ;
/* The jump trampolines for the 16-bit limited relocs will reside here. */
*(.trampolines)
*(.trampolines*)
__trampolines_end = . ;
/* For future tablejump instruction arrays for 3 byte pc devices.
We don't relax jump/call instructions within these sections. */
*(.jumptables)
*(.jumptables*)
/* For code that needs to reside in the lower 128k progmem. */
*(.lowtext)
*(.lowtext*)
__ctors_start = . ;
*(.ctors)
__ctors_end = . ;
__dtors_start = . ;
*(.dtors)
__dtors_end = . ;
KEEP(SORT(*)(.ctors))
KEEP(SORT(*)(.dtors))
/* From this point on, we don't bother about wether the insns are
below or above the 16 bits boundary. */
*(.init0) /* Start here after reset. */
KEEP (*(.init0))
*(.init1)
KEEP (*(.init1))
*(.init2) /* Clear __zero_reg__, set up stack pointer. */
KEEP (*(.init2))
*(.init3)
KEEP (*(.init3))
*(.init4) /* Initialize data and BSS. */
KEEP (*(.init4))
*(.init5)
KEEP (*(.init5))
*(.init6) /* C++ constructors. */
KEEP (*(.init6))
*(.init7)
KEEP (*(.init7))
*(.init8)
KEEP (*(.init8))
*(.init9) /* Call main(). */
KEEP (*(.init9))
*(.text)
. = ALIGN(2);
*(.text.*)
. = ALIGN(2);
*(.fini9) /* _exit() starts here. */
KEEP (*(.fini9))
*(.fini8)
KEEP (*(.fini8))
*(.fini7)
KEEP (*(.fini7))
*(.fini6) /* C++ destructors. */
KEEP (*(.fini6))
*(.fini5)
KEEP (*(.fini5))
*(.fini4)
KEEP (*(.fini4))
*(.fini3)
KEEP (*(.fini3))
*(.fini2)
KEEP (*(.fini2))
*(.fini1)
KEEP (*(.fini1))
*(.fini0) /* Infinite loop after program termination. */
KEEP (*(.fini0))
_etext = .;
*(.rodata) /* We need to include .rodata here if gcc is used */
*(.rodata*) /* with -fdata-sections. */
} > text
.data : AT (ADDR (.text) + SIZEOF (.text))
{
PROVIDE (__data_start = .) ;
/* --gc-sections will delete empty .data. This leads to wrong start
addresses for subsequent sections because -Tdata= from the command
line will have no effect, see PR13697. Thus, keep .data */
KEEP (*(.data))
*(.data*)
*(.gnu.linkonce.d*)
. = ALIGN(2);
_edata = . ;
PROVIDE (__data_end = .) ;
} > data
.bss : AT (ADDR (.bss))
{
PROVIDE (__bss_start = .) ;
*(.bss)
*(.bss*)
*(COMMON)
PROVIDE (__bss_end = .) ;
} > data
__data_load_start = LOADADDR(.data);
__data_load_end = __data_load_start + SIZEOF(.data);
/* Global data not cleared after reset. */
.noinit :
{
PROVIDE (__noinit_start = .) ;
*(.noinit*)
PROVIDE (__noinit_end = .) ;
_end = . ;
PROVIDE (__heap_start = .) ;
} > data
.eeprom :
{
KEEP (*(.eeprom*))
__eeprom_end = . ;
} > eeprom
.fuse :
{
KEEP(*(.fuse))
KEEP(*(.lfuse))
KEEP(*(.hfuse))
KEEP(*(.efuse))
} > fuse
.lock :
{
KEEP(*(.lock*))
} > lock
.signature :
{
KEEP(*(.signature*))
} > signature
.user_signatures :
{
KEEP(*(.user_signatures*))
} > user_signatures
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
.stab.excl 0 : { *(.stab.excl) }
.stab.exclstr 0 : { *(.stab.exclstr) }
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
/* DWARF debug sections.
Symbols in the DWARF debugging sections are relative to the beginning
of the section so we begin them at 0. */
/* DWARF 1 */
.debug 0 : { *(.debug) }
.line 0 : { *(.line) }
/* GNU DWARF 1 extensions */
.debug_srcinfo 0 : { *(.debug_srcinfo) }
.debug_sfnames 0 : { *(.debug_sfnames) }
/* DWARF 1.1 and DWARF 2 */
.debug_aranges 0 : { *(.debug_aranges) }
.debug_pubnames 0 : { *(.debug_pubnames) }
/* DWARF 2 */
.debug_info 0 : { *(.debug_info) *(.gnu.linkonce.wi.*) }
.debug_abbrev 0 : { *(.debug_abbrev) }
.debug_line 0 : { *(.debug_line) }
.debug_frame 0 : { *(.debug_frame) }
.debug_str 0 : { *(.debug_str) }
.debug_loc 0 : { *(.debug_loc) }
.debug_macinfo 0 : { *(.debug_macinfo) }
/* DWARF 3 */
.debug_pubtypes 0 : { *(.debug_pubtypes) }
.debug_ranges 0 : { *(.debug_ranges) }
/* DWARF Extension. */
.debug_macro 0 : { *(.debug_macro) }
}
The customization compared to the default avr-gcc
linker script is as follows (it moves .rodata
to .text
):
--- default.ld 2017-07-15 15:20:01.762345763 +0800
+++ lookup-text.ld 2017-07-15 15:01:27.386390685 +0800
@@ -146,6 +146,8 @@
*(.fini0) /* Infinite loop after program termination. */
KEEP (*(.fini0))
_etext = . ;
+ *(.rodata) /* We need to include .rodata here if gcc is used */
+ *(.rodata*) /* with -fdata-sections. */
} > text
.data : AT (ADDR (.text) + SIZEOF (.text))
{
@@ -155,8 +157,6 @@
line will have no effect, see PR13697. Thus, keep .data */
KEEP (*(.data))
*(.data*)
- *(.rodata) /* We need to include .rodata here if gcc is used */
- *(.rodata*) /* with -fdata-sections. */
*(.gnu.linkonce.d*)
. = ALIGN(2);
_edata = . ;