| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD |
| ; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,CHECK-GI |
| |
| ; @llvm.aarch64.cls must be directly translated into the 'cls' instruction |
| |
; The i32 intrinsic must map 1:1 onto 'cls' on the 32-bit register view.
define i32 @cls(i32 %t) {
; CHECK-LABEL: cls:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cls w0, w0
; CHECK-NEXT:    ret
  %cls.i = call i32 @llvm.aarch64.cls(i32 %t)
  ret i32 %cls.i
}
| |
; The i64 intrinsic must map 1:1 onto 'cls' on the 64-bit register view.
; The i32 return is the low half of x0, so the truncation needs no extra
; instruction — only the liveness marker ("kill") comment below.
define i32 @cls64(i64 %t) {
; CHECK-LABEL: cls64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cls x0, x0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %cls.i = call i32 @llvm.aarch64.cls64(i64 %t)
  ret i32 %cls.i
}
| |
| declare i32 @llvm.aarch64.cls(i32) nounwind |
| declare i32 @llvm.aarch64.cls64(i64) nounwind |
| |
; cls of an i8: the canonical expansion ctlz(x ^ (x >>s 7)) - 1 is matched
; and lowered to a 32-bit cls. There is no byte-sized cls, so the input is
; sign-extended to 32 bits first and the 24 extra copies of the sign bit
; are subtracted from the result.
define i8 @cls_i8(i8 %x) {
; CHECK-LABEL: cls_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sxtb w8, w0
; CHECK-NEXT:    cls w8, w8
; CHECK-NEXT:    sub w0, w8, #24
; CHECK-NEXT:    ret

  %a = ashr i8 %x, 7                              ; sign mask: 0 or -1
  %b = xor i8 %x, %a                              ; fold sign bits to leading zeros
  %c = call i8 @llvm.ctlz.i8(i8 %b, i1 false)
  %d = sub i8 %c, 1                               ; ctlz counts the sign bit itself; drop it
  ret i8 %d
}
| |
; The result is in the range [1-31], so we don't need an 'and' after the cls.
define i32 @cls_i32_knownbits(i32 %x) {
; CHECK-LABEL: cls_i32_knownbits:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cls w0, w0
; CHECK-NEXT:    ret
  %a = ashr i32 %x, 31                            ; sign mask: 0 or -1
  %b = xor i32 %x, %a                             ; fold sign bits to leading zeros
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  %e = and i32 %d, 31                             ; must fold away via known bits on cls
  ret i32 %e
}
| |
; There are at least 16 redundant sign bits so we don't need an 'orr' after the cls.
define i32 @cls_i32_knownbits_2(i16 signext %x) {
; CHECK-LABEL: cls_i32_knownbits_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cls w0, w0
; CHECK-NEXT:    ret
  %sext = sext i16 %x to i32                      ; guarantees >= 16 redundant sign bits
  %a = ashr i32 %sext, 31
  %b = xor i32 %sext, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1                              ; result known to be >= 16 here
  %e = or i32 %d, 16                              ; must fold away via known bits on cls
  ret i32 %e
}
| |
; Check that the known-bits range max computed for the cls result
; is not set to 32
; SDAG matches the alternate cls expansion ctlz((b << 1) | 1, /*zero-undef*/)
; and folds the final 'or 16' decision through known bits; GlobalISel does not
; match the pattern and keeps the open-coded clz sequence.
define i64 @cls_i64_not_32(i64 %x) {
; CHECK-SD-LABEL: cls_i64_not_32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    asr x8, x0, #16
; CHECK-SD-NEXT:    cls x8, x8
; CHECK-SD-NEXT:    orr x0, x8, #0x10
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: cls_i64_not_32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    asr x8, x0, #63
; CHECK-GI-NEXT:    eor x8, x8, x0, asr #16
; CHECK-GI-NEXT:    lsl x8, x8, #1
; CHECK-GI-NEXT:    orr x8, x8, #0x1
; CHECK-GI-NEXT:    clz x8, x8
; CHECK-GI-NEXT:    orr x0, x8, #0x10
; CHECK-GI-NEXT:    ret
  %val = ashr i64 %x, 16                          ; >= 16 redundant sign bits
  %a = ashr i64 %val, 63                          ; sign mask
  %b = xor i64 %val, %a                           ; fold sign bits to leading zeros
  %c = shl i64 %b, 1
  %d = or i64 %c, 1                               ; force non-zero so zero-undef ctlz is safe
  %e = call i64 @llvm.ctlz.i64(i64 %d, i1 true)
  %f = or i64 %e, 16                              ; folds only if cls is known >= 16, not == 32
  ret i64 %f
}
| |
; There are at least 24 redundant sign bits so we don't need an 'orr' after the cls.
define i32 @cls_i32_knownbits_3(i8 signext %x) {
; CHECK-LABEL: cls_i32_knownbits_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cls w0, w0
; CHECK-NEXT:    ret
  %sext = sext i8 %x to i32                       ; guarantees >= 24 redundant sign bits
  %a = ashr i32 %sext, 31
  %b = xor i32 %sext, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1                              ; result known to be >= 24 here
  %e = or i32 %d, 24                              ; must fold away via known bits on cls
  ret i32 %e
}
| |
; Negative test. We only know there is at least 1 redundant sign bit. We can't
; remove the 'orr'.
define i32 @cls_i32_knownbits_4(i32 signext %x) {
; CHECK-LABEL: cls_i32_knownbits_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sbfx w8, w0, #0, #31
; CHECK-NEXT:    cls w8, w8
; CHECK-NEXT:    orr w0, w8, #0x1
; CHECK-NEXT:    ret
  %shl = shl i32 %x, 1
  %ashr = ashr i32 %shl, 1                        ; shl+ashr pair: only 1 known redundant sign bit
  %a = ashr i32 %ashr, 31
  %b = xor i32 %ashr, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  %e = or i32 %d, 1                               ; must NOT fold away — orr stays in the output
  ret i32 %e
}
| |
| ; Negative test. Check that the number of sign bits is not |
| ; overestimated. If it is, the orr disappears. |
define i32 @cls_i32_knownbits_no_overestimate(i32 %x) {
; NOTE(review): parameter attribute kept as in original below — do not edit IR.
; CHECK-SD-LABEL: cls_i32_knownbits_no_overestimate:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    asr w8, w0, #15
; CHECK-SD-NEXT:    cls w8, w8
; CHECK-SD-NEXT:    orr w0, w8, #0x10
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: cls_i32_knownbits_no_overestimate:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    asr w8, w0, #31
; CHECK-GI-NEXT:    eor w8, w8, w0, asr #15
; CHECK-GI-NEXT:    clz w8, w8
; CHECK-GI-NEXT:    sub w8, w8, #1
; CHECK-GI-NEXT:    orr w0, w8, #0x10
; CHECK-GI-NEXT:    ret
  %ashr = ashr i32 %x, 15                         ; exactly 15 extra sign bits, not 16
  %a = ashr i32 %ashr, 31
  %b = xor i32 %ashr, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1                              ; cls result only known >= 15
  %e = or i32 %d, 16                              ; must NOT fold away — orr stays in the output
  ret i32 %e
}
| |
; Test that computeKnownBits works with NEON CLS intrinsics now that they are
; lowered through the target CLS node (the comment previously said "ISD::CTLS",
; which is not an ISD opcode).
; The CLS result for v4i32 is in range [0, 31], so the `and 31` should be optimized away.
define <4 x i32> @neon_cls_v4i32_knownbits(<4 x i32> %a) nounwind {
; CHECK-LABEL: neon_cls_v4i32_knownbits:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cls v0.4s, v0.4s
; CHECK-NEXT:    ret
  %result = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a)
  ; Per-lane cls result fits in [0, 31], so masking with 31 is a no-op and
  ; must be removed by known-bits analysis.
  %and = and <4 x i32> %result, <i32 31, i32 31, i32 31, i32 31>
  ret <4 x i32> %and
}
| |
| declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone |
| |
| ; Test ensures that the compiler generates no extra instructions |
| ; for __builtin_clzg output type conversion |
; ctlz.i8 widened to a 32-bit clz: mask to 8 bits, clz, then subtract the 24
; extra leading zeros. SDAG drops the final mask thanks to the 'zext nneg';
; GlobalISel currently still emits a redundant 'and' on the result.
define i32 @foo8(i8 %0) {
; CHECK-SD-LABEL: foo8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    and w8, w0, #0xff
; CHECK-SD-NEXT:    clz w8, w8
; CHECK-SD-NEXT:    sub w0, w8, #24
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: foo8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    and w8, w0, #0xff
; CHECK-GI-NEXT:    clz w8, w8
; CHECK-GI-NEXT:    sub w8, w8, #24
; CHECK-GI-NEXT:    and w0, w8, #0xff
; CHECK-GI-NEXT:    ret
  %2 = tail call i8 @llvm.ctlz.i8(i8 %0, i1 false)
  %3 = zext nneg i8 %2 to i32                     ; nneg: widening needs no masking
  ret i32 %3
}
| |
| ; Test ensures that the compiler generates no extra instructions |
| ; for __builtin_clzg output type conversion |
; Same as foo8 but for i16: mask to 16 bits, clz, subtract 16. SDAG drops the
; final mask thanks to the 'zext nneg'; GlobalISel currently still emits a
; redundant 'and' on the result.
define i32 @foo16(i16 %0) {
; CHECK-SD-LABEL: foo16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    and w8, w0, #0xffff
; CHECK-SD-NEXT:    clz w8, w8
; CHECK-SD-NEXT:    sub w0, w8, #16
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: foo16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    and w8, w0, #0xffff
; CHECK-GI-NEXT:    clz w8, w8
; CHECK-GI-NEXT:    sub w8, w8, #16
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
  %2 = tail call i16 @llvm.ctlz.i16(i16 %0, i1 false)
  %3 = zext nneg i16 %2 to i32                    ; nneg: widening needs no masking
  ret i32 %3
}