summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNadav Rotem <nadav.rotem@intel.com>2012-04-24 11:07:03 +0000
committerNadav Rotem <nadav.rotem@intel.com>2012-04-24 11:07:03 +0000
commitd1a79136e33afd7069edb4c73d31af3fafb840a2 (patch)
treebd9371c3cebf9c27e0921744b2a86b977e61f8eb
parentadb082cd6ae2ec799838cd84d54a381bca175cd4 (diff)
AVX: We lower VECTOR_SHUFFLE and BUILD_VECTOR nodes into vbroadcast instructions
using the pattern (vbroadcast (i32load src)). In some cases, after we generate this pattern, new users are added to the load node, which prevents the selection of the broadcast pattern. This commit provides fallback patterns which perform an in-vector broadcast (using in-vector vbroadcast on AVX2 and pshufd on AVX1). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@155437 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/X86/X86InstrSSE.td43
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll45
2 files changed, 83 insertions, 5 deletions
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 450d29a8574..8cd00a9aa0e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7723,6 +7723,20 @@ let Predicates = [HasAVX2] in {
(VPBROADCASTQrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
(VPBROADCASTQYrm addr:$src)>;
+
+ // Provide fallback in case the load node that is used in the patterns above
+ // is used by additional users, which prevents the pattern selection.
+ let AddedComplexity = 20 in {
+ // Register-source broadcasts: the scalar operand is an FR32/FR64 register
+ // rather than a load. Each pattern first places the scalar in the low
+ // element of an otherwise undefined 128-bit value (INSERT_SUBREG into
+ // IMPLICIT_DEF) and then feeds that to the register ("rr") form of the
+ // broadcast instruction.
+ // f32 scalar -> 4 x f32.
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSrr
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+ // f32 scalar -> 8 x f32; the source operand is still built as a v4f32.
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSYrr
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+ // f64 scalar -> 4 x f64; the source operand is built as a v2f64.
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VBROADCASTSDrr
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>;
+ }
}
// AVX1 broadcast patterns
@@ -7735,11 +7749,38 @@ def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
(VBROADCASTSDrm addr:$src)>;
-
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
+
+ // Provide fallback in case the load node that is used in the patterns above
+ // is used by additional users, which prevents the pattern selection.
+ let AddedComplexity = 20 in {
+ // Register-source broadcasts built from VPSHUFD. The scalar is first placed
+ // in the low element of an otherwise undefined 128-bit value (INSERT_SUBREG
+ // into IMPLICIT_DEF). VPSHUFD's 8-bit immediate holds four 2-bit selectors,
+ // one per 32-bit destination lane:
+ //   0    -> lanes {0,0,0,0}: replicates the low 32 bits (one f32).
+ //   0x44 -> lanes {0,1,0,1}: replicates the low 64 bits (one f64).
+ // An immediate of 0 on an f64 source would copy only the low half of the
+ // double into every 32-bit lane, so the FR64 patterns must use 0x44.
+ // 128bit broadcasts:
+ def : Pat<(v2f64 (X86VBroadcast FR64:$src)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), 0x44)>;
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0)>;
+ // 256bit broadcasts: the 128-bit splat is inserted as the low half via
+ // sub_xmm and the same splat is written to the upper half with
+ // VINSERTF128 (index 1).
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0),
+ sub_xmm),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss),
+ 0), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), 0x44),
+ sub_xmm),
+ (VPSHUFDri
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd),
+ 0x44), 1)>;
+ }
}
//===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 1a78414761c..6eba694bd4a 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -160,6 +160,15 @@ entry:
ret <8 x i32> %g
}
+; Adds a constant vector whose eight lanes all hold the same float to the
+; argument; the FileCheck directives below require a vbroadcastss to appear
+; before the function's ret.
+; CHECK: V113
+; CHECK: vbroadcastss
+; CHECK: ret
+define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
+entry:
+ %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
+ ret <8 x float> %g
+}
+
; CHECK: _e2
; CHECK: vbroadcastss
; CHECK: ret
@@ -179,9 +188,37 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
%vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
%vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
- %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 3
- %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 3
- %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 3
- %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 3
+ %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
+ %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
+ %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
+ %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
ret <8 x i8> %vecinit7.i
}
+
+
+; No FileCheck directives accompany this function in this hunk, so it can only
+; verify that compilation completes.
+; NOTE(review): the name suggests this input used to crash the backend --
+; confirm against the commit that introduced it.
+define void @crash() nounwind alwaysinline {
+WGLoopsEntry:
+ br i1 undef, label %ret, label %footer329VF
+
+footer329VF:
+ %A.0.inVF = fmul float undef, 6.553600e+04
+ %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
+ %A.0VF = fptosi float %A.0.inVF to i32
+ %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
+ %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %1 = and i32 %A.0VF, 65535
+ ; Splat of a computed scalar: insert %1 into lane 0, then shuffle with an
+ ; all-zero mask so every lane reads lane 0.
+ %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
+ %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
+ br i1 undef, label %preload1201VF, label %footer349VF
+
+preload1201VF:
+ br label %footer349VF
+
+footer349VF:
+ %2 = mul nsw <8 x i32> undef, %0
+ %3 = mul nsw <8 x i32> undef, %vector1099VF
+ br label %footer329VF
+
+ret:
+ ret void
+}