[X86] Directly emit VPTERNLOG from canonicalizeBitSelect when possible.
Seems to produce better results on some rotate tests. And is
neutral for other tests.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0b84635..1146b61 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42782,6 +42782,16 @@
}
SDLoc DL(N);
+
+ if (UseVPTERNLOG) {
+ // Emit a VPTERNLOG node directly.
+ SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
+ SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
+ SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
+ SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
+ return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
+ }
+
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 1807646..2980a16 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1786,12 +1786,10 @@
define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: splatconstant_rotate_mask_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
-; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm2
+; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT: vpandn %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm2, %ymm0
+; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
%shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index d27d398..31fe575 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -1802,11 +1802,9 @@
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpandn %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8: