[GlobalISel][AArch64] Legalize + select some llvm.ctlz.* intrinsics

Legalize/select llvm.ctlz.*

Add select-ctlz to show that we actually select them. Update arm64-clrsb.ll and
arm64-vclz.ll to show that we perform valid transformations in optimized builds,
and document where GISel can improve.

Differential Revision: https://reviews.llvm.org/D58155

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354299 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 38f0e22..614a67f 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2099,6 +2099,7 @@
   case G_FSIN:
   case G_FSQRT:
   case G_BSWAP:
+  case G_CTLZ:
     return fewerElementsVectorBasic(MI, TypeIdx, NarrowTy);
   case G_SHL:
   case G_LSHR:
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 3c57af8..94a6628 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -457,6 +457,10 @@
       })
       .minScalarSameAs(1, 0);
 
+  getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct(
+      {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+      .scalarize(1);
+
   computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 3a1562e..623ab52 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -322,7 +322,7 @@
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
 #
 # DEBUG-NEXT: G_CTLZ (opcode {{[0-9]+}}): 2 type indices
-# DEBUG:      .. type index coverage check SKIPPED: no rules defined
+# DEBUG:      .. the first uncovered type index: 2, OK
 #
 # DEBUG-NEXT: G_CTLZ_ZERO_UNDEF (opcode {{[0-9]+}}): 2 type indices
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
diff --git a/test/CodeGen/AArch64/GlobalISel/select-ctlz.mir b/test/CodeGen/AArch64/GlobalISel/select-ctlz.mir
new file mode 100644
index 0000000..74e4903
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-ctlz.mir
@@ -0,0 +1,200 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=arm64-unknown-unknown -global-isel -run-pass=instruction-select %s -o - | FileCheck %s
+
+name:            test_v8s8
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: test_v8s8
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[CLZv8i8_:%[0-9]+]]:fpr64 = CLZv8i8 [[COPY]]
+    ; CHECK: $d0 = COPY [[CLZv8i8_]]
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:fpr(<8 x s8>) = COPY $d0
+    %1:fpr(<8 x s8>) = G_CTLZ %0(<8 x s8>)
+    $d0 = COPY %1(<8 x s8>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4s16
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: test_v4s16
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[CLZv4i16_:%[0-9]+]]:fpr64 = CLZv4i16 [[COPY]]
+    ; CHECK: $d0 = COPY [[CLZv4i16_]]
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:fpr(<4 x s16>) = COPY $d0
+    %1:fpr(<4 x s16>) = G_CTLZ %0(<4 x s16>)
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v2s32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: test_v2s32
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[CLZv2i32_:%[0-9]+]]:fpr64 = CLZv2i32 [[COPY]]
+    ; CHECK: $d0 = COPY [[CLZv2i32_]]
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:fpr(<2 x s32>) = COPY $d0
+    %1:fpr(<2 x s32>) = G_CTLZ %0(<2 x s32>)
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_s64
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: test_s64
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY [[COPY]]
+    ; CHECK: [[CLZXr:%[0-9]+]]:gpr64 = CLZXr [[COPY1]]
+    ; CHECK: $d0 = COPY [[CLZXr]]
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:fpr(s64) = COPY $d0
+    %2:gpr(s64) = COPY %0(s64)
+    %1:gpr(s64) = G_CTLZ %2(s64)
+    $d0 = COPY %1(s64)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_s32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $s0
+    ; CHECK-LABEL: name: test_s32
+    ; CHECK: liveins: $s0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr32 = COPY $s0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
+    ; CHECK: [[CLZWr:%[0-9]+]]:gpr32 = CLZWr [[COPY1]]
+    ; CHECK: $s0 = COPY [[CLZWr]]
+    ; CHECK: RET_ReallyLR implicit $s0
+    %0:fpr(s32) = COPY $s0
+    %2:gpr(s32) = COPY %0(s32)
+    %1:gpr(s32) = G_CTLZ %2(s32)
+    $s0 = COPY %1(s32)
+    RET_ReallyLR implicit $s0
+
+...
+---
+name:            test_v16s8
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: test_v16s8
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK: [[CLZv16i8_:%[0-9]+]]:fpr128 = CLZv16i8 [[COPY]]
+    ; CHECK: $q0 = COPY [[CLZv16i8_]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(<16 x s8>) = COPY $q0
+    %1:fpr(<16 x s8>) = G_CTLZ %0(<16 x s8>)
+    $q0 = COPY %1(<16 x s8>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v8s16
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: test_v8s16
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK: [[CLZv8i16_:%[0-9]+]]:fpr128 = CLZv8i16 [[COPY]]
+    ; CHECK: $q0 = COPY [[CLZv8i16_]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(<8 x s16>) = COPY $q0
+    %1:fpr(<8 x s16>) = G_CTLZ %0(<8 x s16>)
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v4s32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: test_v4s32
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK: [[CLZv4i32_:%[0-9]+]]:fpr128 = CLZv4i32 [[COPY]]
+    ; CHECK: $q0 = COPY [[CLZv4i32_]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(<4 x s32>) = COPY $q0
+    %1:fpr(<4 x s32>) = G_CTLZ %0(<4 x s32>)
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2s64
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name: test_v2s64
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(<2 x s64>) = COPY $q0
+    ; CHECK: [[CTLZ:%[0-9]+]]:fpr(<2 x s64>) = G_CTLZ [[COPY]](<2 x s64>)
+    ; CHECK: $q0 = COPY [[CTLZ]](<2 x s64>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(<2 x s64>) = COPY $q0
+    %1:fpr(<2 x s64>) = G_CTLZ %0(<2 x s64>)
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
diff --git a/test/CodeGen/AArch64/arm64-clrsb.ll b/test/CodeGen/AArch64/arm64-clrsb.ll
index 02368cb..64673f2 100644
--- a/test/CodeGen/AArch64/arm64-clrsb.ll
+++ b/test/CodeGen/AArch64/arm64-clrsb.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios7.0.0 |  FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0.0 -O0 -pass-remarks-missed=gisel* -global-isel-abort=2 |  FileCheck %s --check-prefixes=GISEL,FALLBACK
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
@@ -7,6 +8,7 @@
 declare i64 @llvm.ctlz.i64(i64, i1) #1
 
 ; Function Attrs: nounwind ssp
+; FALLBACK-NOT: remark{{.*}}clrsb32
 define i32 @clrsb32(i32 %x) #2 {
 entry:
   %shr = ashr i32 %x, 31
@@ -18,9 +20,15 @@
   ret i32 %0
 ; CHECK-LABEL: clrsb32
 ; CHECK:   cls [[TEMP:w[0-9]+]], [[TEMP]]
+
+; FIXME: We should produce the same result here to save some code size. After
+; that, we can remove the GISEL special casing.
+; GISEL-LABEL: clrsb32
+; GISEL: clz
 }
 
 ; Function Attrs: nounwind ssp
+; FALLBACK-NOT: remark{{.*}}clrsb64
 define i64 @clrsb64(i64 %x) #3 {
 entry:
   %shr = ashr i64 %x, 63
@@ -32,4 +40,6 @@
   ret i64 %0
 ; CHECK-LABEL: clrsb64
 ; CHECK:   cls [[TEMP:x[0-9]+]], [[TEMP]]
+; GISEL-LABEL: clrsb64
+; GISEL:   cls [[TEMP:x[0-9]+]], [[TEMP]]
 }
diff --git a/test/CodeGen/AArch64/arm64-vclz.ll b/test/CodeGen/AArch64/arm64-vclz.ll
index 016df56..38c0572 100644
--- a/test/CodeGen/AArch64/arm64-vclz.ll
+++ b/test/CodeGen/AArch64/arm64-vclz.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=FALLBACK,CHECK
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_u8
 define <8 x i8> @test_vclz_u8(<8 x i8> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_u8:
   ; CHECK: clz.8b v0, v0
@@ -8,6 +10,7 @@
   ret <8 x i8> %vclz.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_s8
 define <8 x i8> @test_vclz_s8(<8 x i8> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_s8:
   ; CHECK: clz.8b v0, v0
@@ -16,6 +19,7 @@
   ret <8 x i8> %vclz.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_u16
 define <4 x i16> @test_vclz_u16(<4 x i16> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_u16:
   ; CHECK: clz.4h v0, v0
@@ -24,6 +28,7 @@
   ret <4 x i16> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_s16
 define <4 x i16> @test_vclz_s16(<4 x i16> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_s16:
   ; CHECK: clz.4h v0, v0
@@ -32,6 +37,7 @@
   ret <4 x i16> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_u32
 define <2 x i32> @test_vclz_u32(<2 x i32> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_u32:
   ; CHECK: clz.2s v0, v0
@@ -40,6 +46,7 @@
   ret <2 x i32> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_s32
 define <2 x i32> @test_vclz_s32(<2 x i32> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_s32:
   ; CHECK: clz.2s v0, v0
@@ -48,18 +55,21 @@
   ret <2 x i32> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_u64
 define <1 x i64> @test_vclz_u64(<1 x i64> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_u64:
   %vclz1.i = tail call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %a, i1 false) nounwind
   ret <1 x i64> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclz_s64
 define <1 x i64> @test_vclz_s64(<1 x i64> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_s64:
   %vclz1.i = tail call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %a, i1 false) nounwind
   ret <1 x i64> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_u8
 define <16 x i8> @test_vclzq_u8(<16 x i8> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_u8:
   ; CHECK: clz.16b v0, v0
@@ -68,6 +78,7 @@
   ret <16 x i8> %vclz.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_s8
 define <16 x i8> @test_vclzq_s8(<16 x i8> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_s8:
   ; CHECK: clz.16b v0, v0
@@ -76,6 +87,7 @@
   ret <16 x i8> %vclz.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_u16
 define <8 x i16> @test_vclzq_u16(<8 x i16> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_u16:
   ; CHECK: clz.8h v0, v0
@@ -84,6 +96,7 @@
   ret <8 x i16> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_s16
 define <8 x i16> @test_vclzq_s16(<8 x i16> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_s16:
   ; CHECK: clz.8h v0, v0
@@ -92,6 +105,7 @@
   ret <8 x i16> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_u32
 define <4 x i32> @test_vclzq_u32(<4 x i32> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_u32:
   ; CHECK: clz.4s v0, v0
@@ -100,6 +114,7 @@
   ret <4 x i32> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_s32
 define <4 x i32> @test_vclzq_s32(<4 x i32> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_s32:
   ; CHECK: clz.4s v0, v0
@@ -108,12 +123,14 @@
   ret <4 x i32> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_u64
 define <2 x i64> @test_vclzq_u64(<2 x i64> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_u64:
   %vclz1.i = tail call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) nounwind
   ret <2 x i64> %vclz1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}test_vclzq_s64
 define <2 x i64> @test_vclzq_s64(<2 x i64> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclzq_s64:
   %vclz1.i = tail call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) nounwind