[x86] increment/decrement constant vector with min/max in vsetcc lowering (PR39859)

This is part of fixing PR39859:
https://bugs.llvm.org/show_bug.cgi?id=39859

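We have a crippled vector ISA (this lowering builds unsigned vector compares
out of unsigned min/max followed by a compare-for-equality), so we have to
invert a typical fold and create min/max here. For a splat constant C:

  X u> C --> X u>= (C+1) --> X == umax(X, C+1)   (when C is not the max value)
  X u< C --> X u<= (C-1) --> X == umin(X, C-1)   (when C is not zero)

Adjusting the constant this way avoids the extra invert (pcmpeq + pxor) of the
compare result.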

As discussed in the bug report, we can probably do better by using a saturating
subtract when one is available, but we should have this improvement for the
min/max patterns regardless.

Alive proofs:
https://rise4fun.com/Alive/zsf
https://rise4fun.com/Alive/Qrl

Differential Revision: https://reviews.llvm.org/D55515


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@349304 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ed6700f..704b62d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19383,13 +19383,26 @@
   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
 
-  // Special case: Use min/max operations for unsigned compares. We only want
-  // to do this for unsigned compares if we need to flip signs or if it allows
-  // use to avoid an invert.
+  // Special case: Use min/max operations for unsigned compares.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (ISD::isUnsignedIntSetCC(Cond) &&
       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
       TLI.isOperationLegal(ISD::UMIN, VT)) {
+    // If we have a constant operand, increment/decrement it and change the
+    // condition to avoid an invert.
+    // TODO: This could be extended to handle a non-splat constant by checking
+    // that each element of the constant is not the max/null value.
+    APInt C;
+    if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) {
+      // X > C --> X >= (C+1) --> X == umax(X, C+1)
+      Op1 = DAG.getConstant(C + 1, dl, VT);
+      Cond = ISD::SETUGE;
+    }
+    if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) {
+      // X < C --> X <= (C-1) --> X == umin(X, C-1)
+      Op1 = DAG.getConstant(C - 1, dl, VT);
+      Cond = ISD::SETULE;
+    }
     bool Invert = false;
     unsigned Opc;
     switch (Cond) {
diff --git a/test/CodeGen/X86/sat-add.ll b/test/CodeGen/X86/sat-add.ll
index 35292d7..e09c241 100644
--- a/test/CodeGen/X86/sat-add.ll
+++ b/test/CodeGen/X86/sat-add.ll
@@ -526,11 +526,9 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42]
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253]
-; SSE41-NEXT:    pminud %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm2, %xmm0
 ; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
diff --git a/test/CodeGen/X86/vec_minmax_match.ll b/test/CodeGen/X86/vec_minmax_match.ll
index c3652f3..4d6bb79 100644
--- a/test/CodeGen/X86/vec_minmax_match.ll
+++ b/test/CodeGen/X86/vec_minmax_match.ll
@@ -223,12 +223,11 @@
 ; CHECK-LABEL: wrong_pred_for_smin_with_not:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4294967291,4294967291,4294967291,4294967291]
-; CHECK-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; CHECK-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [4294967291,4294967291,4294967291,4294967291]
+; CHECK-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %not_x = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
   %cmp = icmp ugt <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
diff --git a/test/CodeGen/X86/vec_setcc-2.ll b/test/CodeGen/X86/vec_setcc-2.ll
index 4c22606..946c9fc 100644
--- a/test/CodeGen/X86/vec_setcc-2.ll
+++ b/test/CodeGen/X86/vec_setcc-2.ll
@@ -32,17 +32,15 @@
 ; SSE41-NEXT:    je LBB0_3
 ; SSE41-NEXT:  ## %bb.1: ## %for.body.preheader
 ; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [26,26,26,26,26,26,26,26]
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25]
 ; SSE41-NEXT:    .p2align 4, 0x90
 ; SSE41-NEXT:  LBB0_2: ## %for.body
 ; SSE41-NEXT:    ## =>This Inner Loop Header: Depth=1
-; SSE41-NEXT:    movdqa (%rdi,%rax), %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pmaxuw %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSE41-NEXT:    pxor %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, (%rsi,%rax)
+; SSE41-NEXT:    movdqa (%rdi,%rax), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pminuw %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, (%rsi,%rax)
 ; SSE41-NEXT:    addq $16, %rax
 ; SSE41-NEXT:    decl %edx
 ; SSE41-NEXT:    jne LBB0_2
@@ -146,11 +144,9 @@
 define <16 x i8> @test_ult_byte(<16 x i8> %a) {
 ; CHECK-LABEL: test_ult_byte:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
-; CHECK-NEXT:    pmaxub %xmm0, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; CHECK-NEXT:    pminub %xmm0, %xmm1
 ; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %icmp = icmp ult <16 x i8> %a, <i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11>
@@ -187,11 +183,9 @@
 define <16 x i1> @ugt_v16i8_splat(<16 x i8> %x) {
 ; CHECK-LABEL: ugt_v16i8_splat:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
-; CHECK-NEXT:    pminub %xmm0, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [43,43,43,43,43,43,43,43,43,43,43,43,43,43,43,43]
+; CHECK-NEXT:    pmaxub %xmm0, %xmm1
 ; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %cmp = icmp ugt <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
   ret <16 x i1> %cmp
@@ -206,11 +200,9 @@
 ;
 ; SSE41-LABEL: ugt_v8i16_splat:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242]
-; SSE41-NEXT:    pminuw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
+; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %cmp = icmp ugt <8 x i16> %x, <i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242>
   ret <8 x i1> %cmp
@@ -225,11 +217,9 @@
 ;
 ; SSE41-LABEL: ugt_v4i32_splat:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
-; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %cmp = icmp ugt <4 x i32> %x, <i32 -42, i32 -42, i32 -42, i32 -42>
   ret <4 x i1> %cmp
@@ -341,11 +331,9 @@
 define <16 x i1> @ult_v16i8_splat(<16 x i8> %x) {
 ; CHECK-LABEL: ult_v16i8_splat:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
-; CHECK-NEXT:    pmaxub %xmm0, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41]
+; CHECK-NEXT:    pminub %xmm0, %xmm1
 ; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %cmp = icmp ult <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
   ret <16 x i1> %cmp
@@ -361,11 +349,9 @@
 ;
 ; SSE41-LABEL: ult_v8i16_splat:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242]
-; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [241,241,241,241,241,241,241,241]
+; SSE41-NEXT:    pminuw %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %cmp = icmp ult <8 x i16> %x, <i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242>
   ret <8 x i1> %cmp
@@ -382,11 +368,9 @@
 ;
 ; SSE41-LABEL: ult_v4i32_splat:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
-; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %cmp = icmp ult <4 x i32> %x, <i32 -42, i32 -42, i32 -42, i32 -42>
   ret <4 x i1> %cmp
@@ -494,6 +478,30 @@
   ret <2 x i1> %cmp
 }
 
+; This should be simplified before we reach lowering, but
+; make sure that we are not getting it wrong by underflowing.
+
+define <4 x i1> @ult_v4i32_splat_0_simplify(<4 x i32> %x) {
+; CHECK-LABEL: ult_v4i32_splat_0_simplify:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp ult <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i1> %cmp
+}
+
+; This should be simplified before we reach lowering, but
+; make sure that we are not getting it wrong by overflowing.
+
+define <4 x i1> @ugt_v4i32_splat_maxval_simplify(<4 x i32> %x) {
+; CHECK-LABEL: ugt_v4i32_splat_maxval_simplify:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i1> %cmp
+}
+
 define <4 x i1> @ugt_v4i32_nonsplat(<4 x i32> %x) {
 ; SSE2-LABEL: ugt_v4i32_nonsplat:
 ; SSE2:       ## %bb.0:
@@ -524,11 +532,9 @@
 ;
 ; SSE41-LABEL: ugt_v4i32_splat_commute:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,4,4,4]
-; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3,3,3,3]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %cmp = icmp ugt <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %x
   ret <4 x i1> %cmp
@@ -549,11 +555,9 @@
 ; SSE41-LABEL: PR39859:
 ; SSE41:       ## %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [42,42,42,42,42,42,42,42]
-; SSE41-NEXT:    pminuw %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43]
+; SSE41-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq