diff --git a/amps.s b/amps.s index e5b722e8..e69de29b 100644 --- a/amps.s +++ b/amps.s @@ -1,509 +0,0 @@ - -C builder ---------- - -Welcome to the C building report for instructtest.c -module main functions: 1, blocks: 7, instructions: 90 -========================== -module main; - -global function i32 instruct_tests(i32 a, i32 b) { - instruct_tests_block0: { - blob<4:4> alloca = alloc 4 bytes aligned at 4; - ptr alloca_addr = &alloca; - blob<4:4> alloca_55 = alloc 4 bytes aligned at 4; - ptr alloca_addr_56 = &alloca_55; - blob<4:4> alloca_57 = alloc 4 bytes aligned at 4; - ptr alloca_addr_58 = &alloca_57; - blob<4:4> alloca_59 = alloc 4 bytes aligned at 4; - ptr alloca_addr_60 = &alloca_59; - blob<4:4> alloca_61 = alloc 4 bytes aligned at 4; - ptr alloca_addr_62 = &alloca_61; - blob<8:4> alloca_63 = alloc 8 bytes aligned at 4; - ptr alloca_addr_64 = &alloca_63; - blob<4:4> alloca_65 = alloc 4 bytes aligned at 4; - ptr alloca_addr_66 = &alloca_65; - jmp instruct_tests_block1; - } - - instruct_tests_block1: { - store a, alloca_addr; - store b, alloca_addr_56; - i32 num = 0; - store num, alloca_addr_58; - ptr num_0 = 4; - ptr tmp = alloca_addr_58 + num_0; - i32 tmp_load = load alloca_addr; - i32 num_1 = 4; - i32 tmp_2 = tmp_load + num_1; - store tmp_2, alloca_addr_60; - ptr num_3 = 4; - ptr tmp_4 = alloca_addr_60 + num_3; - i32 tmp_load_5 = load alloca_addr_56; - i32 num_6 = 255; - i32 tmp_7 = tmp_load_5 & num_6; - store tmp_7, alloca_addr_62; - ptr num_8 = 4; - ptr tmp_9 = alloca_addr_62 + num_8; - i32 tmp_load_10 = load alloca_addr_60; - i32 tmp_load_11 = load alloca_addr_62; - i32 tmp_12 = tmp_load_10 ^ tmp_load_11; - i32 tmp_load_13 = load alloca_addr_58; - i32 tmp_14 = tmp_load_13 ^ tmp_12; - store tmp_14, alloca_addr_58; - i32 num_15 = 0; - ptr typecast = cast num_15; - ptr num_16 = 4; - ptr tmp_17 = typecast * num_16; - ptr tmp_18 = alloca_addr_64 + tmp_17; - i32 tmp_load_19 = load alloca_addr_58; - store tmp_load_19, tmp_18; - i32 num_20 = 1; - ptr typecast_21 = cast num_20; - ptr num_22 = 4; - ptr tmp_23 = typecast_21 * num_22; - ptr tmp_24 = alloca_addr_64 + tmp_23; - i32 num_25 = 0; - ptr typecast_26 = cast num_25; - ptr num_27 = 4; - ptr tmp_28 = typecast_26 * num_27; - ptr tmp_29 = alloca_addr_64 + tmp_28; - i32 tmp_load_30 = load tmp_29; - i32 num_31 = 1; - i32 tmp_32 = tmp_load_30 + num_31; - store tmp_32, tmp_24; - i32 num_33 = 1; - ptr typecast_34 = cast num_33; - ptr num_35 = 4; - ptr tmp_36 = typecast_34 * num_35; - ptr tmp_37 = alloca_addr_64 + tmp_36; - i32 tmp_load_38 = load tmp_37; - store tmp_load_38, alloca_addr_66; - ptr num_39 = 4; - ptr tmp_40 = alloca_addr_66 + num_39; - i32 tmp_load_41 = load alloca_addr_66; - i32 tmp_load_42 = load alloca_addr_60; - cjmp tmp_load_41 >= tmp_load_42 ? instruct_tests_block3 : instruct_tests_block4; - } - - instruct_tests_block2: { - i32 tmp_load_51 = load alloca_addr_58; - i32 tmp_load_52 = load alloca_addr_66; - i32 tmp_53 = tmp_load_51 + tmp_load_52; - return tmp_53; - } - - instruct_tests_block3: { - i32 num_43 = 2; - i32 tmp_load_44 = load alloca_addr_66; - i32 tmp_45 = tmp_load_44 - num_43; - store tmp_45, alloca_addr_66; - jmp instruct_tests_block2; - } - - instruct_tests_block4: { - i32 tmp_load_46 = load alloca_addr_66; - i32 tmp_load_47 = load alloca_addr_60; - cjmp tmp_load_46 == tmp_load_47 ? 
instruct_tests_block6 : instruct_tests_block5; - } - - instruct_tests_block5: { - jmp instruct_tests_block2; - } - - instruct_tests_block6: { - i32 num_48 = 3; - i32 tmp_load_49 = load alloca_addr_66; - i32 tmp_50 = tmp_load_49 + num_48; - store tmp_50, alloca_addr_66; - jmp instruct_tests_block5; - } - -} -========================== -module main before optimization: -module main functions: 1, blocks: 7, instructions: 90 -========================== -module main; - -global function i32 instruct_tests(i32 a, i32 b) { - instruct_tests_block0: { - blob<4:4> alloca = alloc 4 bytes aligned at 4; - ptr alloca_addr = &alloca; - blob<4:4> alloca_55 = alloc 4 bytes aligned at 4; - ptr alloca_addr_56 = &alloca_55; - blob<4:4> alloca_57 = alloc 4 bytes aligned at 4; - ptr alloca_addr_58 = &alloca_57; - blob<4:4> alloca_59 = alloc 4 bytes aligned at 4; - ptr alloca_addr_60 = &alloca_59; - blob<4:4> alloca_61 = alloc 4 bytes aligned at 4; - ptr alloca_addr_62 = &alloca_61; - blob<8:4> alloca_63 = alloc 8 bytes aligned at 4; - ptr alloca_addr_64 = &alloca_63; - blob<4:4> alloca_65 = alloc 4 bytes aligned at 4; - ptr alloca_addr_66 = &alloca_65; - jmp instruct_tests_block1; - } - - instruct_tests_block1: { - store a, alloca_addr; - store b, alloca_addr_56; - i32 num = 0; - store num, alloca_addr_58; - ptr num_0 = 4; - ptr tmp = alloca_addr_58 + num_0; - i32 tmp_load = load alloca_addr; - i32 num_1 = 4; - i32 tmp_2 = tmp_load + num_1; - store tmp_2, alloca_addr_60; - ptr num_3 = 4; - ptr tmp_4 = alloca_addr_60 + num_3; - i32 tmp_load_5 = load alloca_addr_56; - i32 num_6 = 255; - i32 tmp_7 = tmp_load_5 & num_6; - store tmp_7, alloca_addr_62; - ptr num_8 = 4; - ptr tmp_9 = alloca_addr_62 + num_8; - i32 tmp_load_10 = load alloca_addr_60; - i32 tmp_load_11 = load alloca_addr_62; - i32 tmp_12 = tmp_load_10 ^ tmp_load_11; - i32 tmp_load_13 = load alloca_addr_58; - i32 tmp_14 = tmp_load_13 ^ tmp_12; - store tmp_14, alloca_addr_58; - i32 num_15 = 0; - ptr typecast = cast num_15; - ptr num_16 = 4; - ptr tmp_17 = typecast * num_16; - ptr tmp_18 = alloca_addr_64 + tmp_17; - i32 tmp_load_19 = load alloca_addr_58; - store tmp_load_19, tmp_18; - i32 num_20 = 1; - ptr typecast_21 = cast num_20; - ptr num_22 = 4; - ptr tmp_23 = typecast_21 * num_22; - ptr tmp_24 = alloca_addr_64 + tmp_23; - i32 num_25 = 0; - ptr typecast_26 = cast num_25; - ptr num_27 = 4; - ptr tmp_28 = typecast_26 * num_27; - ptr tmp_29 = alloca_addr_64 + tmp_28; - i32 tmp_load_30 = load tmp_29; - i32 num_31 = 1; - i32 tmp_32 = tmp_load_30 + num_31; - store tmp_32, tmp_24; - i32 num_33 = 1; - ptr typecast_34 = cast num_33; - ptr num_35 = 4; - ptr tmp_36 = typecast_34 * num_35; - ptr tmp_37 = alloca_addr_64 + tmp_36; - i32 tmp_load_38 = load tmp_37; - store tmp_load_38, alloca_addr_66; - ptr num_39 = 4; - ptr tmp_40 = alloca_addr_66 + num_39; - i32 tmp_load_41 = load alloca_addr_66; - i32 tmp_load_42 = load alloca_addr_60; - cjmp tmp_load_41 >= tmp_load_42 ? instruct_tests_block3 : instruct_tests_block4; - } - - instruct_tests_block2: { - i32 tmp_load_51 = load alloca_addr_58; - i32 tmp_load_52 = load alloca_addr_66; - i32 tmp_53 = tmp_load_51 + tmp_load_52; - return tmp_53; - } - - instruct_tests_block3: { - i32 num_43 = 2; - i32 tmp_load_44 = load alloca_addr_66; - i32 tmp_45 = tmp_load_44 - num_43; - store tmp_45, alloca_addr_66; - jmp instruct_tests_block2; - } - - instruct_tests_block4: { - i32 tmp_load_46 = load alloca_addr_66; - i32 tmp_load_47 = load alloca_addr_60; - cjmp tmp_load_46 == tmp_load_47 ? 
instruct_tests_block6 : instruct_tests_block5; - } - - instruct_tests_block5: { - jmp instruct_tests_block2; - } - - instruct_tests_block6: { - i32 num_48 = 3; - i32 tmp_load_49 = load alloca_addr_66; - i32 tmp_50 = tmp_load_49 + num_48; - store tmp_50, alloca_addr_66; - jmp instruct_tests_block5; - } - -} -========================== - -Code generation ---------------- - -Target: atalla-arch - -Log for global function i32 instruct_tests(i32 a, i32 b) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -========================== -global function i32 instruct_tests(i32 a, i32 b) { - instruct_tests_block0: { - blob<4:4> alloca = alloc 4 bytes aligned at 4; - ptr alloca_addr = &alloca; - blob<4:4> alloca_55 = alloc 4 bytes aligned at 4; - ptr alloca_addr_56 = &alloca_55; - blob<4:4> alloca_57 = alloc 4 bytes aligned at 4; - ptr alloca_addr_58 = &alloca_57; - blob<4:4> alloca_59 = alloc 4 bytes aligned at 4; - ptr alloca_addr_60 = &alloca_59; - blob<4:4> alloca_61 = alloc 4 bytes aligned at 4; - ptr alloca_addr_62 = &alloca_61; - blob<8:4> alloca_63 = alloc 8 bytes aligned at 4; - ptr alloca_addr_64 = &alloca_63; - blob<4:4> alloca_65 = alloc 4 bytes aligned at 4; - ptr alloca_addr_66 = &alloca_65; - jmp instruct_tests_block1; - } - - instruct_tests_block1: { - store a, alloca_addr; - store b, alloca_addr_56; - i32 num = 0; - store num, alloca_addr_58; - ptr num_0 = 4; - ptr tmp = alloca_addr_58 + num_0; - i32 tmp_load = load alloca_addr; - i32 num_1 = 4; - i32 tmp_2 = tmp_load + num_1; - store tmp_2, alloca_addr_60; - ptr num_3 = 4; - ptr tmp_4 = alloca_addr_60 + num_3; - i32 tmp_load_5 = load alloca_addr_56; - i32 num_6 = 255; - i32 tmp_7 = tmp_load_5 & num_6; - store tmp_7, alloca_addr_62; - ptr num_8 = 4; - ptr tmp_9 = alloca_addr_62 + num_8; - i32 tmp_load_10 = load alloca_addr_60; - i32 tmp_load_11 = load alloca_addr_62; - i32 tmp_12 = tmp_load_10 ^ tmp_load_11; - i32 tmp_load_13 = load alloca_addr_58; - i32 tmp_14 = tmp_load_13 ^ tmp_12; - store tmp_14, alloca_addr_58; - i32 num_15 = 0; - ptr typecast = cast num_15; - ptr num_16 = 4; - ptr tmp_17 = typecast * num_16; - ptr tmp_18 = alloca_addr_64 + tmp_17; - i32 tmp_load_19 = load alloca_addr_58; - store tmp_load_19, tmp_18; - i32 num_20 = 1; - ptr typecast_21 = cast num_20; - ptr num_22 = 4; - ptr tmp_23 = typecast_21 * num_22; - ptr tmp_24 = alloca_addr_64 + tmp_23; - i32 num_25 = 0; - ptr typecast_26 = cast num_25; - ptr num_27 = 4; - ptr tmp_28 = typecast_26 * num_27; - ptr tmp_29 = alloca_addr_64 + tmp_28; - i32 tmp_load_30 = load tmp_29; - i32 num_31 = 1; - i32 tmp_32 = tmp_load_30 + num_31; - store tmp_32, tmp_24; - i32 num_33 = 1; - ptr typecast_34 = cast num_33; - ptr num_35 = 4; - ptr tmp_36 = typecast_34 * num_35; - ptr tmp_37 = alloca_addr_64 + tmp_36; - i32 tmp_load_38 = load tmp_37; - store tmp_load_38, alloca_addr_66; - ptr num_39 = 4; - ptr tmp_40 = alloca_addr_66 + num_39; - i32 tmp_load_41 = load alloca_addr_66; - i32 tmp_load_42 = load alloca_addr_60; - cjmp tmp_load_41 >= tmp_load_42 ? 
instruct_tests_block3 : instruct_tests_block4; - } - - instruct_tests_block2: { - i32 tmp_load_51 = load alloca_addr_58; - i32 tmp_load_52 = load alloca_addr_66; - i32 tmp_53 = tmp_load_51 + tmp_load_52; - return tmp_53; - } - - instruct_tests_block3: { - i32 num_43 = 2; - i32 tmp_load_44 = load alloca_addr_66; - i32 tmp_45 = tmp_load_44 - num_43; - store tmp_45, alloca_addr_66; - jmp instruct_tests_block2; - } - - instruct_tests_block4: { - i32 tmp_load_46 = load alloca_addr_66; - i32 tmp_load_47 = load alloca_addr_60; - cjmp tmp_load_46 == tmp_load_47 ? instruct_tests_block6 : instruct_tests_block5; - } - - instruct_tests_block5: { - jmp instruct_tests_block2; - } - - instruct_tests_block6: { - i32 num_48 = 3; - i32 tmp_load_49 = load alloca_addr_66; - i32 tmp_50 = tmp_load_49 + num_48; - store tmp_50, alloca_addr_66; - jmp instruct_tests_block5; - } - -} -========================== -Selection trees: - instruct_tests_block0: - JMP[instruct_tests_block1:] - instruct_tests_block1: - STRI32(FPRELU32[Stack[4 bytes at -4]], REGI32[vreg0a]) - STRI32(FPRELU32[Stack[4 bytes at -8]], REGI32[vreg1b]) - STRI32(FPRELU32[Stack[4 bytes at -12]], CONSTI32[0]) - MOVI32[vreg3tmp_load](LDRI32(FPRELU32[Stack[4 bytes at -4]])) - STRI32(FPRELU32[Stack[4 bytes at -16]], ADDI32(REGI32[vreg3tmp_load], CONSTI32[4])) - MOVI32[vreg4tmp_load_5](LDRI32(FPRELU32[Stack[4 bytes at -8]])) - STRI32(FPRELU32[Stack[4 bytes at -20]], ANDI32(REGI32[vreg4tmp_load_5], CONSTI32[255])) - MOVI32[vreg5tmp_load_10](LDRI32(FPRELU32[Stack[4 bytes at -16]])) - MOVI32[vreg6tmp_load_11](LDRI32(FPRELU32[Stack[4 bytes at -20]])) - MOVI32[vreg7tmp_load_13](LDRI32(FPRELU32[Stack[4 bytes at -12]])) - STRI32(FPRELU32[Stack[4 bytes at -12]], XORI32(REGI32[vreg7tmp_load_13], XORI32(REGI32[vreg5tmp_load_10], REGI32[vreg6tmp_load_11]))) - MOVI32[vreg8tmp_load_19](LDRI32(FPRELU32[Stack[4 bytes at -12]])) - STRI32(ADDU32(FPRELU32[Stack[8 bytes at -28]], MULU32(CONSTI32[0], CONSTU32[4])), REGI32[vreg8tmp_load_19]) - MOVI32[vreg9tmp_load_30](LDRI32(ADDU32(FPRELU32[Stack[8 bytes at -28]], MULU32(CONSTI32[0], CONSTU32[4])))) - STRI32(ADDU32(FPRELU32[Stack[8 bytes at -28]], MULU32(CONSTI32[1], CONSTU32[4])), ADDI32(REGI32[vreg9tmp_load_30], CONSTI32[1])) - MOVI32[vreg10tmp_load_38](LDRI32(ADDU32(FPRELU32[Stack[8 bytes at -28]], MULU32(CONSTI32[1], CONSTU32[4])))) - STRI32(FPRELU32[Stack[4 bytes at -32]], REGI32[vreg10tmp_load_38]) - MOVI32[vreg11tmp_load_41](LDRI32(FPRELU32[Stack[4 bytes at -32]])) - MOVI32[vreg12tmp_load_42](LDRI32(FPRELU32[Stack[4 bytes at -16]])) - CJMPI32[('>=', instruct_tests_block3:, instruct_tests_block4:)](REGI32[vreg11tmp_load_41], REGI32[vreg12tmp_load_42]) - instruct_tests_block2: - MOVI32[vreg16tmp_load_51](LDRI32(FPRELU32[Stack[4 bytes at -12]])) - MOVI32[vreg17tmp_load_52](LDRI32(FPRELU32[Stack[4 bytes at -32]])) - MOVI32[vreg2retval](ADDI32(REGI32[vreg16tmp_load_51], REGI32[vreg17tmp_load_52])) - JMP[instruct_tests_epilog:] - instruct_tests_block3: - MOVI32[vreg13tmp_load_44](LDRI32(FPRELU32[Stack[4 bytes at -32]])) - STRI32(FPRELU32[Stack[4 bytes at -32]], SUBI32(REGI32[vreg13tmp_load_44], CONSTI32[2])) - JMP[instruct_tests_block2:] - instruct_tests_block4: - MOVI32[vreg14tmp_load_46](LDRI32(FPRELU32[Stack[4 bytes at -32]])) - MOVI32[vreg15tmp_load_47](LDRI32(FPRELU32[Stack[4 bytes at -16]])) - CJMPI32[('==', instruct_tests_block6:, instruct_tests_block5:)](REGI32[vreg14tmp_load_46], REGI32[vreg15tmp_load_47]) - instruct_tests_block5: - JMP[instruct_tests_block2:] - instruct_tests_block6: - 
MOVI32[vreg18tmp_load_49](LDRI32(FPRELU32[Stack[4 bytes at -32]])) - STRI32(FPRELU32[Stack[4 bytes at -32]], ADDI32(REGI32[vreg18tmp_load_49], CONSTI32[3])) - JMP[instruct_tests_block5:] - instruct_tests_epilog: -Frame instruct_tests -$ VUseDef -$ addi_s vreg0a, x12, 0 -$ addi_s vreg1b, x13, 0 -$ instruct_tests_block0: -$ jal x0, instruct_tests_block1 -$ instruct_tests_block1: -$ sw_s vreg0a, -4(x8) -$ sw_s vreg1b, -8(x8) -$ li_s vreg19, 0 -$ sw_s vreg19, -12(x8) -$ lw_s vreg20, -4(x8) -$ addi_s vreg3tmp_load, vreg20, 0 -$ addi_s vreg21, vreg3tmp_load, 4 -$ sw_s vreg21, -16(x8) -$ lw_s vreg22, -8(x8) -$ addi_s vreg4tmp_load_5, vreg22, 0 -$ andi_s vreg23, vreg4tmp_load_5, 255 -$ sw_s vreg23, -20(x8) -$ lw_s vreg24, -16(x8) -$ addi_s vreg5tmp_load_10, vreg24, 0 -$ lw_s vreg25, -20(x8) -$ addi_s vreg6tmp_load_11, vreg25, 0 -$ lw_s vreg26, -12(x8) -$ addi_s vreg7tmp_load_13, vreg26, 0 -$ xor_s vreg27, vreg5tmp_load_10, vreg6tmp_load_11 -$ xor_s vreg28, vreg7tmp_load_13, vreg27 -$ sw_s vreg28, -12(x8) -$ lw_s vreg29, -12(x8) -$ addi_s vreg8tmp_load_19, vreg29, 0 -$ addi_s vreg30, x8, -28 -$ li_s vreg31, 0 -$ li_s vreg32, 4 -$ mul_s vreg33, vreg31, vreg32 -$ add_s vreg34, vreg30, vreg33 -$ sw_s vreg8tmp_load_19, 0(vreg34) -$ addi_s vreg35, x8, -28 -$ li_s vreg36, 0 -$ li_s vreg37, 4 -$ mul_s vreg38, vreg36, vreg37 -$ add_s vreg39, vreg35, vreg38 -$ lw_s vreg40, 0(vreg39) -$ addi_s vreg9tmp_load_30, vreg40, 0 -$ addi_s vreg41, x8, -28 -$ li_s vreg42, 1 -$ li_s vreg43, 4 -$ mul_s vreg44, vreg42, vreg43 -$ add_s vreg45, vreg41, vreg44 -$ addi_s vreg46, vreg9tmp_load_30, 1 -$ sw_s vreg46, 0(vreg45) -$ addi_s vreg47, x8, -28 -$ li_s vreg48, 1 -$ li_s vreg49, 4 -$ mul_s vreg50, vreg48, vreg49 -$ add_s vreg51, vreg47, vreg50 -$ lw_s vreg52, 0(vreg51) -$ addi_s vreg10tmp_load_38, vreg52, 0 -$ sw_s vreg10tmp_load_38, -32(x8) -$ lw_s vreg53, -32(x8) -$ addi_s vreg11tmp_load_41, vreg53, 0 -$ lw_s vreg54, -16(x8) -$ addi_s vreg12tmp_load_42, vreg54, 0 -$ bge_s vreg11tmp_load_41, vreg12tmp_load_42, instruct_tests_block3 -$ jal x0, instruct_tests_block4 -$ instruct_tests_block2: -$ lw_s vreg55, -12(x8) -$ addi_s vreg16tmp_load_51, vreg55, 0 -$ lw_s vreg56, -32(x8) -$ addi_s vreg17tmp_load_52, vreg56, 0 -$ add_s vreg57, vreg16tmp_load_51, vreg17tmp_load_52 -$ addi_s vreg2retval, vreg57, 0 -$ jal x0, instruct_tests_epilog -$ instruct_tests_block3: -$ lw_s vreg58, -32(x8) -$ addi_s vreg13tmp_load_44, vreg58, 0 -$ li_s vreg59, 2 -$ sub_s vreg60, vreg13tmp_load_44, vreg59 -$ sw_s vreg60, -32(x8) -$ jal x0, instruct_tests_block2 -$ instruct_tests_block4: -$ lw_s vreg61, -32(x8) -$ addi_s vreg14tmp_load_46, vreg61, 0 -$ lw_s vreg62, -16(x8) -$ addi_s vreg15tmp_load_47, vreg62, 0 -$ beq_s vreg14tmp_load_46, vreg15tmp_load_47, instruct_tests_block6 -$ jal x0, instruct_tests_block5 -$ instruct_tests_block5: -$ jal x0, instruct_tests_block2 -$ instruct_tests_block6: -$ lw_s vreg63, -32(x8) -$ addi_s vreg18tmp_load_49, vreg63, 0 -$ addi_s vreg64, vreg18tmp_load_49, 3 -$ sw_s vreg64, -32(x8) -$ jal x0, instruct_tests_block5 -$ instruct_tests_epilog: -$ addi_s x10, vreg2retval, 0 -$ VUseDef -Frame instruct_tests -$ VUseDef diff --git a/main.py b/main.py index abc30d17..c204f5c1 100644 --- a/main.py +++ b/main.py @@ -6,7 +6,7 @@ import sys def main(): - with open("instructtest.c", "r") as source: + with open("vectortest.c", "r") as source: #cc(f, "atalla") with open("amps.s", "w") as f: reporter = TextReportGenerator(f) diff --git a/ppci/arch/.DS_Store b/ppci/arch/.DS_Store new file mode 100644 index 00000000..fffd187e 
Binary files /dev/null and b/ppci/arch/.DS_Store differ
diff --git a/ppci/arch/amp/arch.py b/ppci/arch/amp/arch.py
index 6a5e331a..a8a24bd3 100644
--- a/ppci/arch/amp/arch.py
+++ b/ppci/arch/amp/arch.py
@@ -62,6 +62,50 @@
     Section,
     dcd
 )
+
+from .vector_instructions import (
+    # Vector-Vector
+    AddVv,
+    SubVv,
+    MulVv,
+    DivVv,
+    AndVv,
+    OrVv,
+    XorVv,
+    GemmVv,
+    MgtVv,
+    MltVv,
+    MeqVv,
+    MneqVv,
+    # Vector-Immediate
+    AddiVi,
+    SubiVi,
+    MuliVi,
+    DiviVi,
+    ExpiVi,
+    SqrtiVi,
+    NotVi,
+    ShiftVi,
+    LwVi,
+    RsumVi,
+    RminVi,
+    RmaxVi,
+    # Vector-Scalar
+    ShiftVs,
+    # TODO: import the systolic-array, scratchpad-DMA and vector load/store
+    # instructions once they are defined in vector_instructions.py.
+)
+
+from .vector_registers import (
+    V0, V1, V2, V3, V4, V5, V6, V7,
+    V8, V9, V10, V11, V12, V13, V14, V15,
+    V16, V17, V18, V19, V20, V21, V22, V23,
+    V24, V25, V26, V27, V28, V29, V30, V31,
+    AtallaVectorRegister,
+    vector_registers,
+    vector_register_class,
+)
+
 from .registers import (
     R0,
     LR,
diff --git a/ppci/arch/amp/tokens.py b/ppci/arch/amp/tokens.py
index 6f71d613..135002e0 100644
--- a/ppci/arch/amp/tokens.py
+++ b/ppci/arch/amp/tokens.py
@@ -1,143 +1,125 @@
 from ..token import Token, bit_range
-#since our instruction defines imm12 everywhere we will use that in each of the classes.
-#we can change later but I'll add a quick fix for it right now
-
-class AtallaSDMAToken(Token):
-    class Info:
-        size = 64
-
-        opcode = bit_range(57, 63)
-        rd1 = bit_range(49, 56)
-        rs1 = bit_range(41, 48)
-        rs2 = bit_range(33, 40)
-        imm12 = bit_range(15, 32)
-        schdImm = bit_range(0, 4)
-
-class AtallaVMToken(Token):
-    class Info:
-        size = 64
-
-        opcode = bit_range(57, 63)
-        rd1 = bit_range(49, 56)
-        rs1 = bit_range(41, 48)
-        rs2 = bit_range(33, 40)
-        imm12 = bit_range(15, 32)
-        schdImm = bit_range(0, 4)
-
-class AtallaSAMToken(Token):
-    class Info:
-        size = 64
-
-        opcode = bit_range(57, 63)
-        rd1 = bit_range(49, 56)
-        rs1 = bit_range(41, 48)
-        rs2 = bit_range(33, 40)
-        imm12 = bit_range(15, 32)
-        schdImm = bit_range(0, 4)
-
-class AtallaTCAToken(Token):
-    class Info:
-        size = 64
-
-        opcode = bit_range(57, 63)
-        rd1 = bit_range(49, 56)
-        rs1 = bit_range(41, 48)
-        rs2 = bit_range(33, 40)
-        imm12 = bit_range(15, 32)
-        schdImm = bit_range(0, 4)
-
-
-# Scalar
 class AtallaRToken(Token):
     class Info:
-        size = 64
-
-        opcode = bit_range(57, 63)
-        rd1 = bit_range(49, 56)
-        rs1 = bit_range(41, 48)
-        rs2 = bit_range(17, 24)
-        imm12 = bit_range(15, 32)
-        schdImm = bit_range(0, 4)
+        size = 40
+        opcode = bit_range(0, 6)
+        rd1 = bit_range(7, 14)
+        rs1 = bit_range(15, 22)
+        rs2 = bit_range(23, 30)
+        reserved = bit_range(31, 39)
 
 class AtallaBRToken(Token):
     class Info:
-        size = 64
-
-        opcode = bit_range(57, 63)
-        rd1 = bit_range(49, 56)
-        rs1 = bit_range(41, 48)
-        rs2 = bit_range(17, 24)
-        imm12 = bit_range(5, 16)
-        schdImm = bit_range(0, 4)
+        size = 40
+        opcode = bit_range(0, 6)
+        incr_imm7 = bit_range(7, 13)
+        i1 = bit_range(14, 14)
+        rs1 = bit_range(15, 22)
+        rs2 = bit_range(23, 30)
+        imm9 = bit_range(31, 39)
 
 class AtallaIToken(Token):
     class Info:
-        size = 64
-
-        opcode = bit_range(57, 63)
-        rd = bit_range(49, 56)
-        rs1 = bit_range(41, 48)
-        rs2 = bit_range(17, 24)
-        imm12 = bit_range(5, 16)
-        schdImm = bit_range(0, 4)
+        size = 40
+        opcode = bit_range(0, 
6) + rd1 = bit_range(7, 14) + rs1 = bit_range(15, 22) + imm12 = bit_range(23, 34) + reserved = bit_range(35, 39) class AtallaMToken(Token): class Info: - size = 64 - - opcode = bit_range(57, 63) - rd = bit_range(49, 56) - rs1 = bit_range(41, 48) - rs2 = bit_range(17, 24) - imm12 = bit_range(5, 16) - schdImm = bit_range(0, 4) + size = 40 + opcode = bit_range(0, 6) + rd1 = bit_range(7, 14) + rs1 = bit_range(15, 22) + imm12 = bit_range(23, 34) + reserved = bit_range(35, 39) class AtallaMIToken(Token): class Info: - size = 64 - - opcode = bit_range(57, 63) - rd = bit_range(49, 56) - imm = bit_range(5, 48) - schdImm = bit_range(0, 4) + size = 40 + opcode = bit_range(0, 6) + rd1 = bit_range(7, 14) + imm25 = bit_range(15, 39) -class AtallaJToken(Token): +class AtallaSToken(Token): class Info: - size = 64 + size = 40 + opcode = bit_range(0, 6) + imm = bit_range(7, 39) - opcode = bit_range(57, 63) - rd1 = bit_range(49, 56) - rs1 = bit_range(41, 48) - imm20 = bit_range(5, 24) - schdImm = bit_range(0, 4) +#vector +class AtallaVVToken(Token): -class AtallaFenceToken(Token): class Info: - size = 64 - - opcode = bit_range(57, 63) - imm = bit_range(15, 56) - schdImm = bit_range(0, 4) - -class AtallaHaltToken(Token): + size = 40 + opcode = bit_range(0, 6) + vd = bit_range(7, 14) + vs1 = bit_range(15, 22) + vs2 = bit_range(23, 30) + mask = bit_range(31, 34) + sac = bit_range(35, 35) + reserved = bit_range(36, 39) + +class AtallaVSToken(Token): class Info: - size = 64 - - opcode = bit_range(57, 63) - imm = bit_range(15, 56) - schdImm = bit_range(0, 4) - -class AtallaNOPToken(Token): + size = 40 + opcode = bit_range(0, 6) + vd = bit_range(7, 14) + vs1 = bit_range(15, 22) + rs1 = bit_range(23, 30) + mask = bit_range(31, 34) + reserved = bit_range(35, 39) + +class AtallaVIToken(Token): + class Info: + size = 40 + opcode = bit_range(0, 6) + vd = bit_range(7, 14) + vs1 = bit_range(15, 22) + imm8 = bit_range(23, 30) + mask = bit_range(31, 34) + imm5 = bit_range(35, 39) + +class AtallaVMemToken(Token): class Info: - size = 64 + size = 40 + opcode = bit_range(0, 6) + vd = bit_range(7, 14) + rs1 = bit_range(15, 22) + tile_r_c_count = bit_range(23, 27) + rc = bit_range(28, 28) + sp = bit_range(29, 30) + mask = bit_range(31, 34) + rc_id = bit_range(35, 39) + +#next +class AtallaSDMAToken(Token): + class Info: + size = 40 + opcode = bit_range(0, 6) + rs1_rd1 = bit_range(7, 14) + rs2 = bit_range(15, 22) + num_rows = bit_range(23, 27) + num_cols = bit_range(28, 32) + sid = bit_range(33, 33) + reserved = bit_range(34, 39) - opcode = bit_range(57, 63) - imm = bit_range(15, 56) - schdImm = bit_range(0, 4) -class DwordToken(Token): +class AtallaMTSToken(Token): class Info: - size = 32 + size = 40 + opcode = bit_range(0, 6) + rd1 = bit_range(7, 14) + vs1 = bit_range(15, 22) + reserved = bit_range(23, 39) - value = bit_range(0, 32) +class AtallaSTMToken(Token): + class Info: + size = 40 + opcode = bit_range(0, 6) + vd = bit_range(7, 14) + rs1 = bit_range(15, 22) + reserved = bit_range(23, 39) diff --git a/ppci/arch/amp/vector_instructions.py b/ppci/arch/amp/vector_instructions.py new file mode 100644 index 00000000..9611fe2f --- /dev/null +++ b/ppci/arch/amp/vector_instructions.py @@ -0,0 +1,305 @@ +from ..encoding import Instruction, Operand, Syntax +from ..isa import ISA + +from .tokens import ( + AtallaVVToken, + AtallaVSToken, + AtallaVIToken, + AtallaVMemToken, +) +from .vector_registers import AtallaVectorRegister +from .registers import AtallaRegister + +isa = ISA() + +class AtallaVVInstruction(Instruction): + tokens = 
[AtallaVVToken] + + +class AtallaVSInstruction(Instruction): + tokens = [AtallaVSToken] + + +class AtallaVIInstruction(Instruction): + tokens = [AtallaVIToken] + + +class AtallaVMemInstruction(Instruction): + tokens = [AtallaVMemToken] + + +def make_vv(mnemonic: str, opcode: int, *, default_mask: int = 0, default_sac: int = 0): + vd = Operand("vd", AtallaVectorRegister, write=True) + vs1 = Operand("vs1", AtallaVectorRegister, read=True) + vs2 = Operand("vs2", AtallaVectorRegister, read=True) + syntax = Syntax([mnemonic, " ", vd, ", ", vs1, ", ", vs2]) + patterns = {"opcode": opcode, "vd": vd, "vs1": vs1, "vs2": vs2, "mask": default_mask, "sac": default_sac} + members = {"syntax": syntax, "vd": vd, "vs1": vs1, "vs2": vs2, "patterns": patterns, "opcode": opcode} + return type(mnemonic.replace(".", "_"), (AtallaVVInstruction,), members) + + +def make_vs(mnemonic: str, opcode: int, *, default_mask: int = 0): + vd = Operand("vd", AtallaVectorRegister, write=True) + vs1 = Operand("vs1", AtallaVectorRegister, read=True) + rs1 = Operand("rs1", AtallaRegister, read=True) + syntax = Syntax([mnemonic, " ", vd, ", ", vs1, ", ", rs1]) + patterns = {"opcode": opcode, "vd": vd, "vs1": vs1, "rs1": rs1, "mask": default_mask} + members = {"syntax": syntax, "vd": vd, "vs1": vs1, "rs1": rs1, "patterns": patterns, "opcode": opcode} + return type(mnemonic.replace(".", "_"), (AtallaVSInstruction,), members) + + +def make_vi(mnemonic: str, opcode: int, *, default_mask: int = 0): + vd = Operand("vd", AtallaVectorRegister, write=True) + vs1 = Operand("vs1", AtallaVectorRegister, read=True) + imm8 = Operand("imm8", int) + imm5 = Operand("imm5", int) + syntax = Syntax([mnemonic, " ", vd, ", ", vs1, ", ", imm8, ", ", imm5]) + patterns = {"opcode": opcode, "vd": vd, "vs1": vs1, "imm8": imm8, "imm5": imm5, "mask": default_mask} + members = {"syntax": syntax, "vd": vd, "vs1": vs1, "imm8": imm8, "imm5": imm5, "patterns": patterns, "opcode": opcode} + return type(mnemonic.replace(".", "_"), (AtallaVIInstruction,), members) + + +def make_vm(mnemonic: str, opcode: int, *, + default_tile_r_c_count: int = 0, default_rc: int = 0, default_sp: int = 0, + default_mask: int = 0, default_rc_id: int = 0): + vd = Operand("vd", AtallaVectorRegister, write=True) + rs1 = Operand("rs1", AtallaRegister, read=True) + syntax = Syntax([mnemonic, " ", vd, ", ", rs1]) + patterns = { + "opcode": opcode, + "vd": vd, "rs1": rs1, + "tile_r_c_count": default_tile_r_c_count, + "rc": default_rc, "sp": default_sp, + "mask": default_mask, "rc_id": default_rc_id, + } + members = {"syntax": syntax, "vd": vd, "rs1": rs1, "patterns": patterns, "opcode": opcode} + return type(mnemonic.replace(".", "_"), (AtallaVMemInstruction,), members) + + +# VV +AddVv = make_vv("add.vv", 0b0101000) +SubVv = make_vv("sub.vv", 0b0101001) +MulVv = make_vv("mul.vv", 0b0101010) +DivVv = make_vv("div.vv", 0b0101011) +AndVv = make_vv("and.vv", 0b0101100) +OrVv = make_vv("or.vv", 0b0101101) +XorVv = make_vv("xor.vv", 0b0101110) +GemmVv = make_vv("gemm.vv", 0b0101111) +MgtVv = make_vv("mgt.vv", 0b0110000) +MltVv = make_vv("mlt.vv", 0b0110001) +MeqVv = make_vv("meq.vv", 0b0110010) +MneqVv = make_vv("mneq.vv", 0b0110011) + +# VI +AddiVi = make_vi("addi.vi", 0b0110100) +SubiVi = make_vi("subi.vi", 0b0110101) +MuliVi = make_vi("muli.vi", 0b0110110) +DiviVi = make_vi("divi.vi", 0b0110111) +ExpiVi = make_vi("expi.vi", 0b0111000) +SqrtiVi = make_vi("sqrti.vi", 0b0111001) +NotVi = make_vi("not.vi", 0b0111010) +ShiftVi = make_vi("shift.vi", 0b0111011) +LwVi = make_vi("lw.vi", 0b0111100) 
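+# NOTE: a sketch of how these generated classes are meant to be used,
+# assuming the standard encode() flow (operand order comes from Syntax):
+#     insn = AddVv(V1, V2, V3)   # add.vv v1, v2, v3
+#     blob = insn.encode()       # one 40-bit AtallaVVToken, opcode 0b0101000
+# The *.vi instructions carry a 13-bit signed immediate that the selection
+# patterns pre-split into the imm8/imm5 token fields via _split_imm13_signed
+# below, e.g. -1 -> (0xFF, 0x1F) and 100 -> (0x03, 0x04).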
+RsumVi = make_vi("rsum.vi", 0b0111101) +RminVi = make_vi("rmin.vi", 0b0111110) +RmaxVi = make_vi("rmax.vi", 0b0111111) + +# VS +ShiftVs = make_vs("shift.vs", 0b0111000) + +# VM +# VregLd = make_vm("vreg.ld", ) +# VregSt = make_vm("vreg.st", ) + +def _new_v(context): + return context.new_reg(AtallaVectorRegister) + +def _new_s(context): + return context.new_reg(AtallaRegister) + +def _split_imm13_signed(val: int): + # 13-bit signed: range [-4096, 4095] + if val < -4096 or val > 4095: + raise ValueError("imm13 out of range") + # two's complement form in 13 bits + u = val & 0x1FFF + imm8 = (u >> 5) & 0xFF + imm5 = u & 0x1F + return imm8, imm5 + +# ---------- VV (vector-vector) ---------- + +@isa.pattern("vreg", "ADDVV(vreg, vreg)", size=2) +def patt_add_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(AddVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "SUBVV(vreg, vreg)", size=2) +def patt_sub_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(SubVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "MULVV(vreg, vreg)", size=2) +def patt_mul_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(MulVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "DIVVV(vreg, vreg)", size=2) +def patt_div_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(DivVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "ANDVV(vreg, vreg)", size=2) +def patt_and_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(AndVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "ORVV(vreg, vreg)", size=2) +def patt_or_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(OrVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "XORVV(vreg, vreg)", size=2) +def patt_xor_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(XorVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "GEMMVV(vreg, vreg)", size=2) +def patt_gemm_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(GemmVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "MGTVV(vreg, vreg)", size=2) +def patt_mgt_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(MgtVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "MLTVV(vreg, vreg)", size=2) +def patt_mlt_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(MltVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "MEQVV(vreg, vreg)", size=2) +def patt_meq_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(MeqVv(d, v0, v1)) + return d + +@isa.pattern("vreg", "MNEQVV(vreg, vreg)", size=2) +def patt_mneq_vv(ctx, tree, v0, v1): + d = _new_v(ctx) + ctx.emit(MneqVv(d, v0, v1)) + return d + +# ---------- VI (vector-immediate; 13-bit signed immediate) ---------- + +def _emit_vi_binop(ctx, d, vsrc, imm, InsnClass): + imm8, imm5 = _split_imm13_signed(imm) + ctx.emit(InsnClass(d, vsrc, imm8, imm5)) + +# ADDI +@isa.pattern("vreg", "ADDVI(vreg, CONSTI32)", size=2, + condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_add_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, AddiVi) + return d + +# SUBI +@isa.pattern("vreg", "SUBVI(vreg, CONSTI32)", size=2, + condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_sub_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, SubiVi) + return d + +# MULI +@isa.pattern("vreg", "MULVI(vreg, CONSTI32)", size=2, + condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_mul_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, MuliVi) + return d + +# DIVI +@isa.pattern("vreg", "DIVVI(vreg, CONSTI32)", size=2, + 
condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_div_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, DiviVi) + return d + +# EXP (immediate exponent) +@isa.pattern("vreg", "EXPVI(vreg, CONSTI32)", size=2, + condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_exp_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, ExpiVi) + return d + +# SQRT (mode/precision as imm if your ISA uses it) +@isa.pattern("vreg", "SQRTVI(vreg, CONSTI32)", size=2, + condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_sqrt_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, SqrtiVi) + return d + +# NOT (use imm as a control/mask if required by your ISA; 0 is typical) +@isa.pattern("vreg", "NOTVI(vreg, CONSTI32)", size=2, + condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_not_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, NotVi) + return d + +# SHIFT (vector by immediate) +@isa.pattern("vreg", "SHIFTVI(vreg, CONSTI32)", size=2, + condition=lambda t: -4096 <= t.children[1].value <= 4095) +def patt_shift_vi(ctx, tree, vsrc): + d = _new_v(ctx) + imm = tree.children[1].value + _emit_vi_binop(ctx, d, vsrc, imm, ShiftVi) + return d + +# Fallback for VI when imm doesn't fit: lift to VS by materializing scalar +def _materialize_scalar_const(ctx, val: int): + s = _new_s(ctx) + # Reuse your existing scalar constant pattern helpers; if not available, emit li_s/addi_s etc. + # Here we assume context has a utility to place an immediate into a scalar reg. + ctx.emit_li(s, val) # If you don't have ctx.emit_li, replace with your scalar sequence. + return s + +@isa.pattern("vreg", "ADDVI(vreg, CONSTI32)", size=4, + condition=lambda t: not (-4096 <= t.children[1].value <= 4095)) +def patt_add_vi_wide(ctx, tree, vsrc): + d = _new_v(ctx) + val = tree.children[1].value + s = _materialize_scalar_const(ctx, val) + ctx.emit(ShiftVs(d, vsrc, s)) # Replace with AddVs if/when you add the VS add opcode + return d + +# ---------- VS (vector-scalar) ---------- + +@isa.pattern("vreg", "SHIFTVS(vreg, reg)", size=2) +def patt_shift_vs(ctx, tree, vsrc, sreg): + d = _new_v(ctx) + ctx.emit(ShiftVs(d, vsrc, sreg)) + return d diff --git a/ppci/arch/amp/vector_registers.py b/ppci/arch/amp/vector_registers.py new file mode 100644 index 00000000..8617ce8c --- /dev/null +++ b/ppci/arch/amp/vector_registers.py @@ -0,0 +1,63 @@ +from ... 
import ir +from ..registers import Register, RegisterClass + +class AtallaVectorRegister(Register): + """Vector register for SIMD operations""" + bitsize = 40 + + def __repr__(self): + if self.is_colored: + return f"v{self.color}" + else: + return self.name + +V0 = AtallaVectorRegister("v0", num=0) +V1 = AtallaVectorRegister("v1", num=1) +V2 = AtallaVectorRegister("v2", num=2) +V3 = AtallaVectorRegister("v3", num=3) +V4 = AtallaVectorRegister("v4", num=4) +V5 = AtallaVectorRegister("v5", num=5) +V6 = AtallaVectorRegister("v6", num=6) +V7 = AtallaVectorRegister("v7", num=7) +V8 = AtallaVectorRegister("v8", num=8) +V9 = AtallaVectorRegister("v9", num=9) +V10 = AtallaVectorRegister("v10", num=10) +V11 = AtallaVectorRegister("v11", num=11) +V12 = AtallaVectorRegister("v12", num=12) +V13 = AtallaVectorRegister("v13", num=13) +V14 = AtallaVectorRegister("v14", num=14) +V15 = AtallaVectorRegister("v15", num=15) +V16 = AtallaVectorRegister("v16", num=16) +V17 = AtallaVectorRegister("v17", num=17) +V18 = AtallaVectorRegister("v18", num=18) +V19 = AtallaVectorRegister("v19", num=19) +V20 = AtallaVectorRegister("v20", num=20) +V21 = AtallaVectorRegister("v21", num=21) +V22 = AtallaVectorRegister("v22", num=22) +V23 = AtallaVectorRegister("v23", num=23) +V24 = AtallaVectorRegister("v24", num=24) +V25 = AtallaVectorRegister("v25", num=25) +V26 = AtallaVectorRegister("v26", num=26) +V27 = AtallaVectorRegister("v27", num=27) +V28 = AtallaVectorRegister("v28", num=28) +V29 = AtallaVectorRegister("v29", num=29) +V30 = AtallaVectorRegister("v30", num=30) +V31 = AtallaVectorRegister("v31", num=31) + +vector_registers = [ + V0, V1, V2, V3, V4, V5, V6, V7, + V8, V9, V10, V11, V12, V13, V14, V15, + V16, V17, V18, V19, V20, V21, V22, V23, + V24, V25, V26, V27, V28, V29, V30, V31 +] + +AtallaVectorRegister.registers = vector_registers + +vector_register_class = RegisterClass( + "vreg", + [ir.f32, ir.f64], + AtallaVectorRegister, + [V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, + V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, + V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31] +) diff --git a/vectortest.c b/vectortest.c new file mode 100644 index 00000000..bb278ece --- /dev/null +++ b/vectortest.c @@ -0,0 +1,37 @@ +#include + +// Keep the signature intentionally similar in spirit to your scalar test. +// Arrays are required because vectors load/store multiple elements. +int vector_instruct_tests(const int *A, const int *B, int *C, int stride_bytes, int k_scalar) { + // We'll compute a scalar checksum after vector work, to mirror your return style. + int checksum = 0; + + // ===== VI: set mask/lanes ===== + // Enable the lowest 4 lanes via immediate mask 0xF (binary 1111). + // This "documents" VI usage even if your current backend treats it as a no-op. + asm volatile("mset.vi x9, x9, 0xF"); + + // ===== VM: load vectors from memory ===== + // vreg.ld vd, base(rs1), aux/stride(rs2), imm + // Here: load A into x10, B into x12, using the same stride and imm=0. 
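+    // (Assumed operand order: vd, base in rs1, stride in rs2, then imm, to
+    // match the AtallaVMemToken fields; vreg.ld itself is still commented out
+    // in vector_instructions.py, so this syntax is aspirational.)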
+ asm volatile("vreg.ld x10, %0, %1, 0" :: "r"(A), "r"(stride_bytes)); + asm volatile("vreg.ld x12, %0, %1, 0" :: "r"(B), "r"(stride_bytes)); + + // ===== VV: elementwise add ===== + // x10 = x10 + x12 (C = A + B) + asm volatile("add.vv x10, x10, x12"); + + // ===== VS: add a scalar k to every lane ===== + // x10 = x10 + k_scalar + asm volatile("add.vs x10, x10, %0" :: "r"(k_scalar)); + + // ===== VM: store result vector to memory ===== + asm volatile("vreg.st x10, %0, %1, 0" :: "r"(C), "r"(stride_bytes)); + + // ===== Scalar checksum (like your r_acc + mem_val) ===== + // XOR all four elements, then add the last one (mimics your pattern of mixing ops). + for (int i = 0; i < 4; ++i) checksum ^= C[i]; + checksum += C[3]; + + return checksum; +}
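
For reference, here is a minimal scalar model of what vector_instruct_tests is
expected to compute, assuming the 4 active lanes set by the mset.vi mask, a
contiguous layout (stride_bytes equal to the element stride), and the
add.vv/add.vs semantics described in the comments above. The helper name
vector_instruct_tests_ref and the fixed lane count are assumptions for
illustration, not part of the patch.

/* Hypothetical scalar reference for vectortest.c; 4 lanes assumed. */
int vector_instruct_tests_ref(const int *A, const int *B, int *C, int k_scalar) {
    int checksum = 0;
    for (int i = 0; i < 4; ++i) {
        C[i] = A[i] + B[i] + k_scalar; /* add.vv, then add.vs */
    }
    for (int i = 0; i < 4; ++i) {
        checksum ^= C[i]; /* XOR all four elements */
    }
    checksum += C[3]; /* then add the last one, as in the test */
    return checksum;
}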