[DAG] isSplatValue - only treat binop splats shared undef elements as undef #135597
Conversation
…ame demanded undef elements

llvm#134602 demonstrated an issue where an AND node always had at least one demanded UNDEF element in either operand, and incorrectly reported this as an all-undef result - despite the other operand's element being 0 (so the lane should correctly fold to 0). This fix only recognises binop splats if they share the same demanded undef elements.

This causes a regression in some v2i32 funnel shift patterns (which promote to v4i32 with undefs in the upper elements), which I will address in a later patch.

Fixes llvm#134602
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-selectiondag

Author: Simon Pilgrim (RKSimon)

Changes

#134602 demonstrated an issue where an AND node always had at least one demanded UNDEF element in either operand, and incorrectly reported this as an all-undef result - despite the other operand's element being 0 (so the lane should correctly fold to 0). This fix only recognises binop splats if they share the same demanded undef elements.

This causes a regression in some v2i32 funnel shift patterns (which promote to v4i32 with undefs in the upper elements), which I will address in a later patch.

Fixes #134602

CC @mark-sed

Patch is 41.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135597.diff

6 Files Affected:
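The "binop-specific undef handling" at issue is that some folds stay sound even when one operand is undef. A minimal standalone model of the AND case (plain C++, not LLVM code; the lane encoding via std::optional is made up for illustration):

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

using Lane = std::optional<uint16_t>; // nullopt stands in for an undef element

// AND with a known zero is zero no matter what value the undef lane takes,
// so the annihilating operand must win over "undef".
Lane foldAnd(Lane A, Lane B) {
  if ((A && *A == 0) || (B && *B == 0))
    return Lane{0};      // (and undef, 0) -> 0
  if (!A || !B)
    return std::nullopt; // result really is unconstrained
  return Lane{static_cast<uint16_t>(*A & *B)};
}

int main() {
  Lane R = foldAnd(std::nullopt, Lane{0});
  std::cout << (R ? std::to_string(*R) : std::string("undef")) << "\n"; // 0
}

Reporting the whole splat as undef throws this information away, which is how the zero lanes in PR134602 went missing.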
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d6dcb3f15ae7c..bc2ab4f8a97a0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3002,8 +3002,12 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
APInt UndefLHS, UndefRHS;
SDValue LHS = V.getOperand(0);
SDValue RHS = V.getOperand(1);
+ // Ensure the same demanded undef elts for both operands, otherwise we might
+ // fail to handle binop-specific undef handling.
+ // e.g. (and undef, 0) -> 0 etc.
if (isSplatValue(LHS, DemandedElts, UndefLHS, Depth + 1) &&
- isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1)) {
+ isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1) &&
+ (DemandedElts & UndefLHS) == (DemandedElts & UndefRHS)) {
UndefElts = UndefLHS | UndefRHS;
return true;
}
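To see what the new guard checks, here is a toy model with plain bitmasks standing in for APInt (one bit per element, low bit = element 0; the concrete mask values are illustrative, not taken from the PR):

#include <cstdint>
#include <iostream>

int main() {
  // Suppose the caller demands all four elements of a v4i32 binop.
  uint8_t DemandedElts = 0b1111;

  // Illustrative operands: the LHS splat has two undef upper lanes
  // (e.g. after widening), the constant RHS splat has none.
  uint8_t UndefLHS = 0b1100;
  uint8_t UndefRHS = 0b0000;

  // Old behaviour: the union is reported, so lanes 2-3 look undef to the
  // caller even though the RHS pins them to a known constant.
  uint8_t OldUndefElts = UndefLHS | UndefRHS; // 0b1100

  // New guard: only treat the binop as a splat when both operands agree
  // on which *demanded* lanes are undef.
  bool IsSplat = (DemandedElts & UndefLHS) == (DemandedElts & UndefRHS);

  std::cout << std::boolalpha << IsSplat << " "        // false: rejected
            << static_cast<int>(OldUndefElts) << "\n"; // 12, i.e. 0b1100
}

The guard is deliberately conservative - rejecting the splat entirely - which is what costs the funnel-shift patterns in the tests below.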
diff --git a/llvm/test/CodeGen/X86/pr134602.ll b/llvm/test/CodeGen/X86/pr134602.ll
index e4376cbeab10f..063b6f31fe630 100644
--- a/llvm/test/CodeGen/X86/pr134602.ll
+++ b/llvm/test/CodeGen/X86/pr134602.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
-; FIXME: incorrect vector codegen due to bad handling of splats of binops containing undefs
+; Test for incorrect vector codegen due to bad handling of splats of binops containing undefs
define i32 @PR134602(i16 %a0) {
; X86-LABEL: PR134602:
; X86: # %bb.0:
@@ -14,7 +14,16 @@ define i32 @PR134602(i16 %a0) {
;
; X64-LABEL: PR134602:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-NEXT: paddw %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: psrld $16, %xmm0
+; X64-NEXT: paddw %xmm1, %xmm0
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: cwtl
; X64-NEXT: retq
%splat= insertelement <4 x i16> zeroinitializer, i16 %a0, i64 0
%mul = mul <4 x i16> %splat, <i16 1, i16 1, i16 0, i16 0>
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 8523cb4973827..9ecc6296a844a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -162,42 +162,72 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind {
define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v2i32:
; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v2i32:
; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pmuludq %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v2i32:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX2-NEXT: vpsllq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
+; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i32:
@@ -259,12 +289,22 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; X86-SSE2-NEXT: psllq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-SSE2-NEXT: psllq %xmm1, %xmm0
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer
%res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %splat)
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index eb4d84b8d7dd6..322ebe22671e6 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -248,27 +248,162 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
;
define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
-; SSE-LABEL: splatvar_funnnel_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: psllq %xmm2, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: psllq %xmm2, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: splatvar_funnnel_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrld %xmm2, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrld %xmm7, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: cvttps2dq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; AVX-NEXT: retq
+; SSE41-LABEL: splatvar_funnnel_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pandn %xmm3, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: psrld %xmm7, %xmm8
+; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm4, %xmm6
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: splatvar_funnnel_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v2i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v2i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v2i32:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
@@ -286,26 +421,67 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %
; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatvar_funnnel_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i32:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i32:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm3
+; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: psllq %xmm2, %xmm3
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT: psllq %xmm2, %xmm1
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
-; X86-SSE2-NEXT: movaps %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm5
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT: psrld $1, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: psrld %xmm2, %xmm6
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: psrld %xmm7, %xmm2
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: psrld %xmm6, %xmm7
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT: psrld %xmm5, %xmm1
+; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3]
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-...
[truncated]
 if (isSplatValue(LHS, DemandedElts, UndefLHS, Depth + 1) &&
-    isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1)) {
+    isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1) &&
+    (DemandedElts & UndefLHS) == (DemandedElts & UndefRHS)) {
Hm, wouldn't it be sufficient to set UndefElts = UndefLHS & UndefRHS, but still have isSplatValue() return true?
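That is, something along these lines (a sketch of the suggested shape, not the committed change):

if (isSplatValue(LHS, DemandedElts, UndefLHS, Depth + 1) &&
    isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1)) {
  // A lane is only known-undef if it is undef in both operands; a lane
  // like (and undef, 0) stays non-undef so callers can still fold it to 0.
  UndefElts = UndefLHS & UndefRHS;
  return true;
}

This keeps the splat result while avoiding the over-broad union of undef lanes.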
Yes, I was just being over-cautious - I'll update it.
LGTM
#134602 demonstrated an issue where an AND node always had at least one demanded UNDEF element in either operand, and incorrectly reported this as an all-undef result - despite the other operand's element being 0 (so the lane should correctly fold to 0).
This fix only treats a binop splat's element as undefined if the element is undef in both operands.
Fixes #134602
CC @mark-sed