-rw-r--r--   lib/Target/X86/X86ISelLowering.cpp                      51
-rw-r--r--   test/CodeGen/X86/vector-shuffle-combining-avx2.ll        8
-rw-r--r--   test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll   12
-rw-r--r--   test/CodeGen/X86/vector-shuffle-combining-ssse3.ll      14
4 files changed, 63 insertions(+), 22 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1bdb762a4b7..97103504198 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4192,6 +4192,16 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

+/// Return true if every element in Mask is undef or if its value
+/// falls within the specified range [Low, Hi).
+static bool isUndefOrInRange(ArrayRef<int> Mask,
+                             int Low, int Hi) {
+  for (int M : Mask)
+    if (!isUndefOrInRange(M, Low, Hi))
+      return false;
+  return true;
+}
+
/// Val is either less than zero (undef) or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return (Val < 0 || Val == CmpVal);
@@ -24834,12 +24844,47 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
"Expected unary shuffle");
}
- // We only support permutation of 32/64 bit elements.
- // TODO - support PSHUFLW/PSHUFHW.
unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
+ MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+
+ // Handle PSHUFLW/PSHUFHW repeated patterns.
+ if (MaskScalarSizeInBits == 16) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ ArrayRef<int> LoMask(Mask.data() + 0, 4);
+ ArrayRef<int> HiMask(Mask.data() + 4, 4);
+
+ // PSHUFLW: permute lower 4 elements only.
+ if (isUndefOrInRange(LoMask, 0, 4) &&
+ isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+ Shuffle = X86ISD::PSHUFLW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ PermuteImm = getV4X86ShuffleImm(LoMask);
+ return true;
+ }
+
+ // PSHUFHW: permute upper 4 elements only.
+ if (isUndefOrInRange(HiMask, 4, 8) &&
+ isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+ // Offset the HiMask so that we can create the shuffle immediate.
+ int OffsetHiMask[4];
+ for (int i = 0; i != 4; ++i)
+ OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+ Shuffle = X86ISD::PSHUFHW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+ return true;
+ }
+
+ return false;
+ }
+ return false;
+ }
+
+ // We only support permutation of 32/64 bit elements after this.
if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
return false;
- MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
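
Note on the new matching block above: a 16-bit-element mask that repeats per 128-bit lane is taken as a PSHUFLW when its upper four words stay in place and its lower four draw only from indices 0..3, and as a PSHUFHW in the mirrored case, with the upper indices rebased by 4 before the immediate is built. The standalone C++ sketch below restates that classification for reference only; the helper names are invented, and packing undef slots as the identity index is an illustrative choice, not a claim about how getV4X86ShuffleImm handles them.

#include <array>
#include <cstdint>
#include <optional>

// One 128-bit lane of a 16-bit-element shuffle mask; -1 marks an undef slot,
// defined values are word indices 0..7 within the lane.
using LaneMask = std::array<int, 8>;

// Pack four 2-bit word indices into a PSHUFLW/PSHUFHW immediate
// (imm[1:0] = slot 0, ..., imm[7:6] = slot 3).  Undef slots are packed as
// the identity index here purely for illustration.
static uint8_t packImm(int M0, int M1, int M2, int M3) {
  const int M[4] = {M0, M1, M2, M3};
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= uint8_t(M[i] < 0 ? i : M[i]) << (2 * i);
  return Imm;
}

// PSHUFLW: the low four words may be any permutation of 0..3 (or undef);
// the high four words must remain 4,5,6,7 (or undef).
static std::optional<uint8_t> matchPSHUFLW(const LaneMask &Mask) {
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] >= 4)
      return std::nullopt;
    if (Mask[4 + i] >= 0 && Mask[4 + i] != 4 + i)
      return std::nullopt;
  }
  return packImm(Mask[0], Mask[1], Mask[2], Mask[3]);
}

// PSHUFHW: the mirror image; the high indices are rebased by -4 before
// packing, which is what the OffsetHiMask step in the patch does.
static std::optional<uint8_t> matchPSHUFHW(const LaneMask &Mask) {
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i)
      return std::nullopt;
    if (Mask[4 + i] >= 0 && Mask[4 + i] < 4)
      return std::nullopt;
  }
  auto Rebase = [](int M) { return M < 0 ? M : M - 4; };
  return packImm(Rebase(Mask[4]), Rebase(Mask[5]), Rebase(Mask[6]),
                 Rebase(Mask[7]));
}

For the repeated lane mask in the pshuflw tests, {1,0,3,2,4,5,6,7}, the sketch yields immediate 0xB1; the YMM and ZMM tests below reuse one immediate because the pattern repeats in every 128-bit lane.
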
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 4d164cec3c9..1a77785ab66 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -229,7 +229,7 @@ define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshuflw:
; CHECK: # BB#0:
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,8,9,10,11,12,13,14,15,18,19,16,17,22,23,20,21,24,25,26,27,28,29,30,31]
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; CHECK-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
ret <32 x i8> %res0
@@ -238,14 +238,14 @@ define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK: # BB#0:
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,10,11,8,9,14,15,12,13,16,17,18,19,20,21,22,23,26,27,24,25,30,31,28,29]
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; CHECK-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
ret <32 x i8> %res0
}
-define <32 x i8> @combine_pshufb_as_pshufw(<32 x i8> %a0) {
-; CHECK-LABEL: combine_pshufb_as_pshufw:
+define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_not_as_pshufw:
; CHECK: # BB#0:
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; CHECK-NEXT: retq
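
The renamed combine_pshufb_not_as_pshufw case is the deliberate negative: its repeated per-lane word mask is {1,0,3,2,5,4,7,6}, which permutes both halves, so neither single-immediate form applies and the existing one-instruction VPSHUFB is rightly left alone rather than split into a PSHUFLW plus a PSHUFHW. A small self-contained check of that property (helper names hypothetical, -1 for undef):

#include <array>

// True if the mask leaves words 4..7 in place, i.e. a PSHUFLW candidate.
static bool keepsHighHalf(const std::array<int, 8> &M) {
  for (int i = 4; i != 8; ++i)
    if (M[i] >= 0 && M[i] != i)
      return false;
  return true;
}

// True if the mask leaves words 0..3 in place, i.e. a PSHUFHW candidate.
static bool keepsLowHalf(const std::array<int, 8> &M) {
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 0 && M[i] != i)
      return false;
  return true;
}

int main() {
  // Word mask behind combine_pshufb_not_as_pshufw: both halves move.
  const std::array<int, 8> Neither = {1, 0, 3, 2, 5, 4, 7, 6};
  return (!keepsHighHalf(Neither) && !keepsLowHalf(Neither)) ? 0 : 1;
}
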
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 0f994abba11..ef492e053eb 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -476,8 +476,7 @@ define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
; CHECK-LABEL: combine_permvar_as_pshuflw:
; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
; CHECK-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
ret <32 x i16> %res0
@@ -486,8 +485,7 @@ define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; CHECK-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
ret <32 x i16> %res0
@@ -496,10 +494,8 @@ define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
define <32 x i16> @combine_pshufb_as_pshufw(<32 x i16> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufw:
; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; CHECK-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
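
combine_pshufb_as_pshufw chains two variable VPERMWs whose per-lane masks touch disjoint halves ({1,0,3,2,4,5,6,7}, then {0,1,2,3,5,4,7,6}), and with the patch each one is rewritten to its immediate form, so the two mask loads plus VPERMWs shown above collapse to a VPSHUFLW followed by a VPSHUFHW. Because the two permutations move disjoint halves, their composition is the same in either order; a quick self-contained check (compose is a made-up helper, not an LLVM API):

#include <array>
#include <cassert>

// Compose two word permutations: applying First and then Second gives a
// shuffle whose element i is taken from source index First[Second[i]].
static std::array<int, 8> compose(const std::array<int, 8> &First,
                                  const std::array<int, 8> &Second) {
  std::array<int, 8> Out{};
  for (int i = 0; i != 8; ++i)
    Out[i] = First[Second[i]];
  return Out;
}

int main() {
  const std::array<int, 8> LowSwap = {1, 0, 3, 2, 4, 5, 6, 7};  // PSHUFLW part
  const std::array<int, 8> HighSwap = {0, 1, 2, 3, 5, 4, 7, 6}; // PSHUFHW part
  // Disjoint halves commute; both orders give {1,0,3,2,5,4,7,6} per lane.
  assert(compose(LowSwap, HighSwap) == compose(HighSwap, LowSwap));
  return 0;
}
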
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 7aedc0e2700..85e1071a35a 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -167,12 +167,12 @@ define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,8,9,10,11,12,13,14,15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
ret <16 x i8> %res0
@@ -181,24 +181,24 @@ define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,10,11,8,9,14,15,12,13]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,10,11,8,9,14,15,12,13]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
ret <16 x i8> %res0
}
-define <16 x i8> @combine_pshufb_as_pshufw(<16 x i8> %a0) {
-; SSE-LABEL: combine_pshufb_as_pshufw:
+define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE: # BB#0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_pshufb_as_pshufw:
+; AVX-LABEL: combine_pshufb_not_as_pshufw:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT: retq
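
A final observation on the SSSE3/AVX2 tests: they start from byte-granularity PSHUFB masks, so the new matching only helps once the byte mask is expressible as a whole-word permute, i.e. each byte pair 2*w, 2*w+1 moves together as word w. The sketch below illustrates that widening step under the same -1-for-undef convention; the helper is invented for illustration, and the in-tree combine reaches 16-bit mask elements through its own mask canonicalisation rather than this exact function.

#include <array>
#include <optional>

// Widen a 16 x i8 shuffle mask to an 8 x i16 mask when each destination word
// is fed by the two bytes of a single source word; -1 denotes undef.
static std::optional<std::array<int, 8>>
widenByteMaskToWords(const std::array<int, 16> &ByteMask) {
  std::array<int, 8> WordMask{};
  for (int w = 0; w != 8; ++w) {
    int Lo = ByteMask[2 * w + 0], Hi = ByteMask[2 * w + 1];
    int Word = -1;
    if (Lo >= 0) {
      if (Lo % 2 != 0)                 // must be the low byte of some word
        return std::nullopt;
      Word = Lo / 2;
    }
    if (Hi >= 0) {
      if (Hi % 2 != 1)                 // must be the high byte of some word
        return std::nullopt;
      if (Word >= 0 && Hi / 2 != Word) // both bytes must name the same word
        return std::nullopt;
      Word = Hi / 2;
    }
    WordMask[w] = Word;
  }
  return WordMask;
}

On the pshuflw test's byte mask <2,3,0,1,6,7,4,5,8,9,...,15> this produces the word mask {1,0,3,2,4,5,6,7}, which is exactly what the PSHUFLW branch of matchPermuteVectorShuffle accepts; the not_as_pshufw byte mask also widens cleanly, but its word mask then fails both branches, as noted above.
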