Skip to content
Draft
10 changes: 10 additions & 0 deletions gcc/config/riscv/arcv-rhx100.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@
condmove,mvpair,zicond,cpop,clmul"))
"((arcv_rhx100_issueA_fuse0 + arcv_rhx100_ALU_A_fuse0_early) | (arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse1_early)) | ((arcv_rhx100_issueB_fuse0 + arcv_rhx100_ALU_B_fuse0_early) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse1_early))")

;; Pipeline reservation for insns whose "type" is imul_fused when tuning for
;; arcv_rhx100.  The declared latency is 4 cycles.  In the first cycle the
;; insn reserves arcv_rhx100_issueA_fuse0/1, arcv_rhx100_ALU_A_fuse0/1_early
;; and arcv_rhx100_MPY32 all at once (there is no issueB alternative); the
;; three following cycles reserve no unit.
(define_insn_reservation "arcv_rhx100_imul_fused" 4
(and (eq_attr "tune" "arcv_rhx100")
(eq_attr "type" "imul_fused"))
"(arcv_rhx100_issueA_fuse0 + arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse0_early + arcv_rhx100_ALU_A_fuse1_early + arcv_rhx100_MPY32), nothing*3")

;; Pipeline reservation for insns whose "type" is alu_fused when tuning for
;; arcv_rhx100.  The declared latency is 1 cycle.  For that cycle the insn
;; reserves either the complete A-side set (issueA_fuse0/1 together with
;; ALU_A_fuse0/1_early) or the complete B-side set (issueB_fuse0/1 together
;; with ALU_B_fuse0/1_early).
(define_insn_reservation "arcv_rhx100_alu_fused" 1
(and (eq_attr "tune" "arcv_rhx100")
(eq_attr "type" "alu_fused"))
"(arcv_rhx100_issueA_fuse0 + arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse0_early + arcv_rhx100_ALU_A_fuse1_early) | (arcv_rhx100_issueB_fuse0 + arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse0_early + arcv_rhx100_ALU_B_fuse1_early)")

(define_insn_reservation "arcv_rhx100_jmp_insn" 1
(and (eq_attr "tune" "arcv_rhx100")
(eq_attr "type" "branch,jump,call,jalr,ret,trap"))
Expand Down
2 changes: 2 additions & 0 deletions gcc/config/riscv/iterators.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@
(zero_extract "srliw")])
;; Maps each extraction code to the name of a right-shift RTL code:
;; sign_extract to "ashiftrt" and zero_extract to "lshiftrt".
(define_code_attr extract_shift [(sign_extract "ashiftrt")
(zero_extract "lshiftrt")])
;; Expands to the text "true" for zero_extract and "false" for sign_extract,
;; so a pattern iterated over both codes can test which one it is from
;; inside its C condition string.
(define_code_attr is_zero_extract [(sign_extract "false")
(zero_extract "true")])

;; This code iterator allows the two right shift instructions to be
;; generated from the same template.
Expand Down
3 changes: 2 additions & 1 deletion gcc/config/riscv/riscv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4392,7 +4392,8 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
}
gcc_fallthrough ();
case SIGN_EXTRACT:
if (TARGET_XTHEADBB && outer_code == SET
if ((TARGET_ARCV_RHX100 || TARGET_XTHEADBB)
&& outer_code == SET
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, this was added for the bit-extract fusion.

&& CONST_INT_P (XEXP (x, 1))
&& CONST_INT_P (XEXP (x, 2)))
{
Expand Down
125 changes: 123 additions & 2 deletions gcc/config/riscv/riscv.md
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@
vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,
vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll,
vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz,
vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,
vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,imul_fused,alu_fused,
sf_vc,sf_vc_se"
(cond [(eq_attr "got" "load") (const_string "load")

Expand Down Expand Up @@ -3072,6 +3072,7 @@
;; * Single-bit extraction (SFB)
;; * Extraction instruction th.ext(u) (XTheadBb)
;; * lshrsi3_extend_2 (see above)
;; * Zero extraction fusion (ARC-V)
(define_insn_and_split "*<any_extract:optab><GPR:mode>3"
[(set (match_operand:GPR 0 "register_operand" "=r")
(any_extract:GPR
Expand All @@ -3084,6 +3085,8 @@
&& (INTVAL (operands[2]) == 1))
&& !TARGET_XTHEADBB
&& !TARGET_XANDESPERF
&& !(TARGET_ARCV_RHX100
&& <any_extract:is_zero_extract>)
&& !(TARGET_64BIT
&& (INTVAL (operands[3]) > 0)
&& (INTVAL (operands[2]) + INTVAL (operands[3]) == 32))"
Expand Down Expand Up @@ -4524,7 +4527,63 @@
(mult:SI (sign_extend:SI (match_operand:HI 1 "register_operand"))
(sign_extend:SI (match_operand:HI 2 "register_operand")))
(match_operand:SI 3 "register_operand")))]
"TARGET_XTHEADMAC"
"TARGET_XTHEADMAC || (TARGET_ARCV_RHX100
&& !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL))"
{
if (TARGET_ARCV_RHX100)
{
rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode);
emit_insn (gen_extendhisi2 (tmp0, operands[1]));
emit_insn (gen_extendhisi2 (tmp1, operands[2]));

if (TARGET_64BIT)
{
rtx op0 = gen_reg_rtx (DImode);
emit_insn (gen_madd_split_fused_extended (op0, tmp0, tmp1, operands[3]));
op0 = gen_lowpart (SImode, op0);
SUBREG_PROMOTED_VAR_P (op0) = 1;
SUBREG_PROMOTED_SET (op0, SRP_SIGNED);
emit_move_insn (operands[0], op0);
}
else
{
emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3]));
}

DONE;
}
}
)

;; Unsigned 16x16+32 multiply-accumulate expander for ARC-V RHX-100.
;; Operands 1 and 2 are HImode registers that are zero-extended into fresh
;; SImode pseudos; operand 3 is the SImode addend and operand 0 the SImode
;; result.  The multiply-add itself is emitted through madd_split_fused.
;; The expander is only enabled for 32-bit targets that have a multiplier
;; (M or Zmmul).
(define_expand "umaddhisi4"
  [(set (match_operand:SI 0 "register_operand")
        (plus:SI
          (mult:SI (zero_extend:SI (match_operand:HI 1 "register_operand"))
                   (zero_extend:SI (match_operand:HI 2 "register_operand")))
          (match_operand:SI 3 "register_operand")))]
  "TARGET_ARCV_RHX100
   && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)"
  {
    rtx tmp0 = gen_reg_rtx (SImode);
    rtx tmp1 = gen_reg_rtx (SImode);

    emit_insn (gen_zero_extendhisi2 (tmp0, operands[1]));
    emit_insn (gen_zero_extendhisi2 (tmp1, operands[2]));

    /* The expander condition already requires !TARGET_64BIT, so the
       DImode (sign-extending) variant can never be selected here and only
       the SImode fused multiply-add is emitted.  */
    emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3]));

    DONE;
  }
)

(define_expand "msubhisi4"
Expand All @@ -4536,6 +4595,68 @@
"TARGET_XTHEADMAC"
)

;; SImode multiply-add (op0 = op1 * op2 + op3) emitted from a single insn
;; so the mul/add sequence is output back to back.  RV32 only, and requires
;; a multiplier (M or Zmmul).
;;
;; Alternative 0: op0 is an earlyclobber register, hence distinct from the
;;   addend; the product is formed directly in op0 (2 instructions).
;; Alternative 1: the addend is tied to op0; the product and sum are formed
;;   in the scratch (op4) and then copied to op0 (3 instructions).
;;
;; The "length" attribute is given per alternative because the template
;; emits more than one 4-byte instruction.
(define_insn "madd_split_fused"
  [(set (match_operand:SI 0 "register_operand" "=&r,r")
        (plus:SI
          (mult:SI (match_operand:SI 1 "register_operand" "r,r")
                   (match_operand:SI 2 "register_operand" "r,r"))
          (match_operand:SI 3 "register_operand" "r,?0")))
   (clobber (match_scratch:SI 4 "=&r,&r"))]
  "TARGET_ARCV_RHX100
   && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)"
  {
    /* When the addend lives in the destination register, the product must
       not overwrite it, so go through the scratch register.  */
    if (REGNO (operands[0]) == REGNO (operands[3]))
      return "mul\t%4,%1,%2\n\tadd\t%4,%3,%4\n\tmv\t%0,%4";

    return "mul\t%0,%1,%2\n\tadd\t%0,%0,%3";
  }
  [(set_attr "type" "imul_fused")
   (set_attr "length" "8,12")]
)

;; Sign-extended variant of the fused multiply-add:
;; op0 (DImode) = sign_extend (op1 * op2 + op3), computed with mulw/addw.
;; The pattern produces a DImode register and uses RV64-only *w
;; instructions, so it is restricted to TARGET_64BIT; without that guard
;; the pattern could be recognized on RV32 as well.
;;
;; Alternative 0: op0 is an earlyclobber register, hence distinct from the
;;   addend; the product is formed directly in op0 (2 instructions).
;; Alternative 1: the addend is tied to op0; the product and sum are formed
;;   in the scratch (op4) and then copied to op0 (3 instructions).
;;
;; The "length" attribute is given per alternative because the template
;; emits more than one 4-byte instruction.
(define_insn "madd_split_fused_extended"
  [(set (match_operand:DI 0 "register_operand" "=&r,r")
        (sign_extend:DI
          (plus:SI
            (mult:SI (match_operand:SI 1 "register_operand" "r,r")
                     (match_operand:SI 2 "register_operand" "r,r"))
            (match_operand:SI 3 "register_operand" "r,?0"))))
   (clobber (match_scratch:SI 4 "=&r,&r"))]
  "TARGET_ARCV_RHX100
   && TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)"
  {
    /* When the addend lives in the destination register, the product must
       not overwrite it, so go through the scratch register.  */
    if (REGNO (operands[0]) == REGNO (operands[3]))
      return "mulw\t%4,%1,%2\n\taddw\t%4,%3,%4\n\tmv\t%0,%4";

    return "mulw\t%0,%1,%2\n\taddw\t%0,%0,%3";
  }
  [(set_attr "type" "imul_fused")
   (set_attr "length" "8,12")]
)

;; Zero-extraction of a constant bit-field emitted as an slli/srli pair from
;; a single insn.  Operand 2 is the field width and operand 3 the position of
;; its least significant bit.  RV32 only.  Single-bit extractions are left to
;; other patterns when Zbs is available.
;;
;; The "length" attribute is set to 8 because the template emits two 4-byte
;; instructions.
(define_insn "*zero_extract_fused"
  [(set (match_operand:SI 0 "register_operand" "=r")
        (zero_extract:SI (match_operand:SI 1 "register_operand" "r")
                         (match_operand 2 "const_int_operand")
                         (match_operand 3 "const_int_operand")))]
  "TARGET_ARCV_RHX100 && !TARGET_64BIT
   && (INTVAL (operands[2]) > 1 || !TARGET_ZBS)"
  {
    int amount = INTVAL (operands[2]);
    int end = INTVAL (operands[3]) + amount;

    /* Shift left so the field's top bit lands in the word's top bit, then
       shift right logically so only AMOUNT bits remain at the bottom.  */
    operands[2] = GEN_INT (BITS_PER_WORD - end);
    operands[3] = GEN_INT (BITS_PER_WORD - amount);
    return "slli\t%0,%1,%2\n\tsrli\t%0,%0,%3";
  }
  [(set_attr "type" "alu_fused")
   (set_attr "length" "8")]
)
Comment on lines +4643 to +4658
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I can tell, this fusion was never implemented as a define_insn_and_split. Might not be trivial to force these exact instructions after a split.

Comment on lines +4598 to +4658
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of these patterns used to be a define_insn_and_split. In case we can't get rid of them, let's try to go back to define_insn_and_split (check the downstream commit history) and see what the performance looks like.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we revert madd_split_fused1 back to a define_insn_and_split, we get a 0.898% improvement.


;; String compare with length insn.
;; Argument 0 is the target (result)
;; Argument 1 is the source1
Expand Down
12 changes: 12 additions & 0 deletions gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/* { dg-do compile } */
/* { dg-options "-O2 -mtune=arc-v-rhx-100-series" } */

/* Jumps back to `begin' for as long as X <= 3, giving the compiler a
   compare-against-constant followed by a conditional branch.
   NOTE(review): control can reach the end of this non-void function
   without a return statement; presumably deliberate to keep the generated
   code minimal -- confirm.  */
int
f (int x)
{
begin:
if (x <= 3)
goto begin;
}

/* { dg-final { scan-assembler "\\sli\\sa5,3\n\\sble\\sa0,a5,.L\[0-9\]+\n" } } */
12 changes: 12 additions & 0 deletions gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/* { dg-do compile } */
/* { dg-require-effective-target rv32 } */
/* { dg-skip-if "" { *-*-* } { "-g" "-flto" "-O0" } } */
/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im -mabi=ilp32" } */

/* Returns X plus two independent products, Y * Z and V * W, i.e. two
   chained multiply-add operations.  */
int
f (int x, int y, int z, int v, int w)
{
return x + y * z + v * w;
}

/* { dg-final { scan-assembler {\smul\s([ast][0-9]+),a1,a2\n\sadd\s\1,\1,a0\n\smul\sa0,a3,a4\n\sadd\sa0,a0,\1\n} } } */
14 changes: 14 additions & 0 deletions gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/* { dg-do compile } */
/* { dg-require-effective-target rv32 } */
/* { dg-skip-if "" { *-*-* } { "-g" "-flto" "-O0" "-Oz" "-Os" } } */
/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im_zbs -mabi=ilp32 -dp" } */

/* Yields the AMT-bit field of X whose least significant bit is bit START:
   X is shifted right by START and masked to its low AMT bits.  Assumes
   AMT is smaller than the width of the 0xffffffff constant's type, since
   the mask is built by shifting that constant left by AMT.  */
#define bit_extract(x,start,amt) (((x)>>(start)) & (~(0xffffffff << (amt))))

/* Returns the sum of two bit-fields of X: the 14-bit field starting at
   bit 10 and the single bit at position 1.  */
int
f (int x)
{
return bit_extract(x,10,14) + bit_extract(x,1,1);
}

/* { dg-final { scan-assembler {\sslli\s([ast][0-9]+),a0,8.*zero_extract_fused\n\ssrli\s([ast][0-9]+),\1,18\n\sbexti\sa0,a0,1.*\n\sadd\sa0,\2,a0.*\n} } } */