[X86] Add patterns for rotr by immediate to fix PR41057.

Prior to the introduction of the funnel shift intrinsics, we could count on
rotates by immediate preferring rotl, since that's what MatchRotate checks
first. The or+shift pattern doesn't have an inherent direction, so one must be
chosen arbitrarily.
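
As an illustration (my own sketch, not code from this patch), the missing
direction is just the usual rotate identity: for an 8-bit value the same
or+shift expression is simultaneously a rotate-left by 1 and a rotate-right
by 7, so one form has to be picked.

  // Hypothetical C++ helper, only to show rotl(x, 1) == rotr(x, 7) for i8.
  #include <cstdint>
  static uint8_t rot_by_or_shift(uint8_t x) {
    return uint8_t((x << 1) | (x >> 7)); // no inherent direction
  }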

With funnel shifts there is a direction: fshr will try to use rotr first,
while fshl will try to use rotl first.
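
For reference, a minimal C++ model (an assumption of mine, not code from this
patch; the helper names rotl8/rotr8 are made up) of what fshl/fshr mean when
both inputs are the same value: they are exactly rotl and rotr.

  #include <cstdint>
  // fshl(x, x, n) == rotl(x, n) and fshr(x, x, n) == rotr(x, n) for i8.
  static uint8_t rotl8(uint8_t x, unsigned n) {
    n &= 7;
    return uint8_t((x << n) | (x >> ((8 - n) & 7)));
  }
  static uint8_t rotr8(uint8_t x, unsigned n) {
    n &= 7;
    return uint8_t((x >> n) | (x << ((8 - n) & 7)));
  }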

This patch adds isel patterns for rotr to complement the existing rotl
patterns. I've put the rotr-by-1 patterns in the instruction patterns and
moved the rotl-by-(bitwidth-1) patterns to separate Pat patterns.

Fixes PR41057.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356121 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 0853056..c808803 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -585,16 +585,16 @@
 // Rotate by 1
 def ROR8r1   : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
                  "ror{b}\t$dst",
-                 [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>;
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
 def ROR16r1  : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
                  "ror{w}\t$dst",
-                 [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16;
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize16;
 def ROR32r1  : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
                  "ror{l}\t$dst",
-                 [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32;
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>, OpSize32;
 def ROR64r1  : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
                   "ror{q}\t$dst",
-                  [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>;
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
 } // Constraints = "$src = $dst", SchedRW
 
 let Uses = [CL], SchedRW = [WriteRotateCLLd, WriteRMW] in {
@@ -633,18 +633,18 @@
 // Rotate by 1
 def ROR8m1   : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
                  "ror{b}\t$dst",
-                 [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>;
+                 [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
 def ROR16m1  : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
                  "ror{w}\t$dst",
-                 [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>,
+                 [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
                  OpSize16;
 def ROR32m1  : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
                  "ror{l}\t$dst",
-                 [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>,
+                 [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
                  OpSize32;
 def ROR64m1  : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
                  "ror{q}\t$dst",
-                 [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>,
+                 [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
                  Requires<[In64BitMode]>;
 } // SchedRW
 
@@ -806,6 +806,34 @@
 
 } // Defs = [EFLAGS]
 
+// Use the opposite rotate if it allows us to use the rotate by 1 instruction.
+def : Pat<(rotl GR8:$src1,  (i8 7)),  (ROR8r1  GR8:$src1)>;
+def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>;
+def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>;
+def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>;
+def : Pat<(rotr GR8:$src1,  (i8 7)),  (ROL8r1  GR8:$src1)>;
+def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>;
+def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>;
+def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>;
+
+def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst),
+          (ROR8m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst),
+          (ROR16m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst),
+          (ROR32m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst),
+          (ROR64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(store (rotr (loadi8 addr:$dst), (i8 7)), addr:$dst),
+          (ROL8m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi16 addr:$dst), (i8 15)), addr:$dst),
+          (ROL16m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst),
+          (ROL32m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst),
+          (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
 // Sandy Bridge and newer Intel processors support faster rotates using
 // SHLD to avoid a partial flag update on the normal rotate instructions.
 let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
@@ -813,6 +841,11 @@
             (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
   def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
             (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+
+  def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
+            (SHRD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+  def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
+            (SHRD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
 }
 
 def ROT32L2R_imm8  : SDNodeXForm<imm, [{
@@ -870,19 +903,29 @@
 
   // Prefer RORX which is non-destructive and doesn't update EFLAGS.
   let AddedComplexity = 10 in {
+    def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
+              (RORX32ri GR32:$src, imm:$shamt)>;
+    def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
+              (RORX64ri GR64:$src, imm:$shamt)>;
+
     def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
               (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
     def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
               (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
   }
 
+  def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),
+            (RORX32mi addr:$src, imm:$shamt)>;
+  def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)),
+            (RORX64mi addr:$src, imm:$shamt)>;
+
   def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
             (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
   def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
             (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
 
   // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
-  // immedidate shift, i.e. the following code is considered better
+  // immediate shift, i.e. the following code is considered better
   //
   //  mov %edi, %esi
   //  shl $imm, %esi
diff --git a/test/CodeGen/X86/funnel-shift-rot.ll b/test/CodeGen/X86/funnel-shift-rot.ll
index a66d771..29e1b3e 100644
--- a/test/CodeGen/X86/funnel-shift-rot.ll
+++ b/test/CodeGen/X86/funnel-shift-rot.ll
@@ -222,13 +222,13 @@
 ; X32-SSE2-LABEL: rotr_i8_const_shift7:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-SSE2-NEXT:    rorb $7, %al
+; X32-SSE2-NEXT:    rolb %al
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: rotr_i8_const_shift7:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    movl %edi, %eax
-; X64-AVX2-NEXT:    rorb $7, %al
+; X64-AVX2-NEXT:    rolb %al
 ; X64-AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-AVX2-NEXT:    retq
   %f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 7)
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
index c7ad8f6..d6c5ae2 100644
--- a/test/CodeGen/X86/rot32.ll
+++ b/test/CodeGen/X86/rot32.ll
@@ -472,67 +472,157 @@
 }
 
 define i32 @fshr(i32 %x) nounwind {
-; CHECK32-LABEL: fshr:
-; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    rorl $7, %eax
-; CHECK32-NEXT:    retl
+; X86-LABEL: fshr:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rorl $7, %eax
+; X86-NEXT:    retl
 ;
-; CHECK64-LABEL: fshr:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movl %edi, %eax
-; CHECK64-NEXT:    rorl $7, %eax
-; CHECK64-NEXT:    retq
+; SHLD-LABEL: fshr:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT:    shrdl $7, %eax, %eax
+; SHLD-NEXT:    retl
+;
+; BMI2-LABEL: fshr:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    rorxl $7, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    retl
+;
+; X64-LABEL: fshr:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rorl $7, %eax
+; X64-NEXT:    retq
+;
+; SHLD64-LABEL: fshr:
+; SHLD64:       # %bb.0:
+; SHLD64-NEXT:    movl %edi, %eax
+; SHLD64-NEXT:    shrdl $7, %edi, %eax
+; SHLD64-NEXT:    retq
+;
+; BMI264-LABEL: fshr:
+; BMI264:       # %bb.0:
+; BMI264-NEXT:    rorxl $7, %edi, %eax
+; BMI264-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 7)
   ret i32 %f
 }
 declare i32 @llvm.fshr.i32(i32, i32, i32)
 
 define i32 @fshr1(i32 %x) nounwind {
-; CHECK32-LABEL: fshr1:
-; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    rorl $1, %eax
-; CHECK32-NEXT:    retl
+; X86-LABEL: fshr1:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rorl $1, %eax
+; X86-NEXT:    retl
 ;
-; CHECK64-LABEL: fshr1:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movl %edi, %eax
-; CHECK64-NEXT:    rorl $1, %eax
-; CHECK64-NEXT:    retq
+; SHLD-LABEL: fshr1:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT:    shrdl $1, %eax, %eax
+; SHLD-NEXT:    retl
+;
+; BMI2-LABEL: fshr1:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    rorxl $1, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    retl
+;
+; X64-LABEL: fshr1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rorl $1, %eax
+; X64-NEXT:    retq
+;
+; SHLD64-LABEL: fshr1:
+; SHLD64:       # %bb.0:
+; SHLD64-NEXT:    movl %edi, %eax
+; SHLD64-NEXT:    shrdl $1, %edi, %eax
+; SHLD64-NEXT:    retq
+;
+; BMI264-LABEL: fshr1:
+; BMI264:       # %bb.0:
+; BMI264-NEXT:    rorxl $1, %edi, %eax
+; BMI264-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 1)
   ret i32 %f
 }
 
 define i32 @fshr31(i32 %x) nounwind {
-; CHECK32-LABEL: fshr31:
-; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    rorl $31, %eax
-; CHECK32-NEXT:    retl
+; X86-LABEL: fshr31:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    roll %eax
+; X86-NEXT:    retl
 ;
-; CHECK64-LABEL: fshr31:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movl %edi, %eax
-; CHECK64-NEXT:    rorl $31, %eax
-; CHECK64-NEXT:    retq
+; SHLD-LABEL: fshr31:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT:    shrdl $31, %eax, %eax
+; SHLD-NEXT:    retl
+;
+; BMI2-LABEL: fshr31:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    rorxl $31, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    retl
+;
+; X64-LABEL: fshr31:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    roll %eax
+; X64-NEXT:    retq
+;
+; SHLD64-LABEL: fshr31:
+; SHLD64:       # %bb.0:
+; SHLD64-NEXT:    movl %edi, %eax
+; SHLD64-NEXT:    shrdl $31, %edi, %eax
+; SHLD64-NEXT:    retq
+;
+; BMI264-LABEL: fshr31:
+; BMI264:       # %bb.0:
+; BMI264-NEXT:    rorxl $31, %edi, %eax
+; BMI264-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 31)
   ret i32 %f
 }
 
 define i32 @fshr_load(i32* %p) nounwind {
-; CHECK32-LABEL: fshr_load:
-; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movl (%eax), %eax
-; CHECK32-NEXT:    rorl $7, %eax
-; CHECK32-NEXT:    retl
+; X86-LABEL: fshr_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    rorl $7, %eax
+; X86-NEXT:    retl
 ;
-; CHECK64-LABEL: fshr_load:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movl (%rdi), %eax
-; CHECK64-NEXT:    rorl $7, %eax
-; CHECK64-NEXT:    retq
+; SHLD-LABEL: fshr_load:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT:    movl (%eax), %eax
+; SHLD-NEXT:    shrdl $7, %eax, %eax
+; SHLD-NEXT:    retl
+;
+; BMI2-LABEL: fshr_load:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    rorxl $7, (%eax), %eax
+; BMI2-NEXT:    retl
+;
+; X64-LABEL: fshr_load:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    rorl $7, %eax
+; X64-NEXT:    retq
+;
+; SHLD64-LABEL: fshr_load:
+; SHLD64:       # %bb.0:
+; SHLD64-NEXT:    movl (%rdi), %eax
+; SHLD64-NEXT:    shrdl $7, %eax, %eax
+; SHLD64-NEXT:    retq
+;
+; BMI264-LABEL: fshr_load:
+; BMI264:       # %bb.0:
+; BMI264-NEXT:    rorxl $7, (%rdi), %eax
+; BMI264-NEXT:    retq
   %x = load i32, i32* %p
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 7)
   ret i32 %f
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
index 7259f64..43ece70 100644
--- a/test/CodeGen/X86/rot64.ll
+++ b/test/CodeGen/X86/rot64.ll
@@ -278,42 +278,86 @@
 }
 
 define i64 @fshr(i64 %x) nounwind {
-; ALL-LABEL: fshr:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq %rdi, %rax
-; ALL-NEXT:    rorq $7, %rax
-; ALL-NEXT:    retq
+; X64-LABEL: fshr:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rorq $7, %rax
+; X64-NEXT:    retq
+;
+; SHLD-LABEL: fshr:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movq %rdi, %rax
+; SHLD-NEXT:    shrdq $7, %rdi, %rax
+; SHLD-NEXT:    retq
+;
+; BMI2-LABEL: fshr:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    rorxq $7, %rdi, %rax
+; BMI2-NEXT:    retq
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 7)
   ret i64 %f
 }
 declare i64 @llvm.fshr.i64(i64, i64, i64)
 
 define i64 @fshr1(i64 %x) nounwind {
-; ALL-LABEL: fshr1:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq %rdi, %rax
-; ALL-NEXT:    rorq $1, %rax
-; ALL-NEXT:    retq
+; X64-LABEL: fshr1:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rorq $1, %rax
+; X64-NEXT:    retq
+;
+; SHLD-LABEL: fshr1:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movq %rdi, %rax
+; SHLD-NEXT:    shrdq $1, %rdi, %rax
+; SHLD-NEXT:    retq
+;
+; BMI2-LABEL: fshr1:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    rorxq $1, %rdi, %rax
+; BMI2-NEXT:    retq
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 1)
   ret i64 %f
 }
 
 define i64 @fshr63(i64 %x) nounwind {
-; ALL-LABEL: fshr63:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq %rdi, %rax
-; ALL-NEXT:    rorq $63, %rax
-; ALL-NEXT:    retq
+; X64-LABEL: fshr63:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rolq %rax
+; X64-NEXT:    retq
+;
+; SHLD-LABEL: fshr63:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movq %rdi, %rax
+; SHLD-NEXT:    shrdq $63, %rdi, %rax
+; SHLD-NEXT:    retq
+;
+; BMI2-LABEL: fshr63:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    rorxq $63, %rdi, %rax
+; BMI2-NEXT:    retq
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 63)
   ret i64 %f
 }
 
 define i64 @fshr_load(i64* %p) nounwind {
-; ALL-LABEL: fshr_load:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq (%rdi), %rax
-; ALL-NEXT:    rorq $7, %rax
-; ALL-NEXT:    retq
+; X64-LABEL: fshr_load:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    rorq $7, %rax
+; X64-NEXT:    retq
+;
+; SHLD-LABEL: fshr_load:
+; SHLD:       # %bb.0:
+; SHLD-NEXT:    movq (%rdi), %rax
+; SHLD-NEXT:    shrdq $7, %rax, %rax
+; SHLD-NEXT:    retq
+;
+; BMI2-LABEL: fshr_load:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    rorxq $7, (%rdi), %rax
+; BMI2-NEXT:    retq
   %x = load i64, i64* %p
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 7)
   ret i64 %f