; blob: cb0c0aa5ec2edee1fcf420c10cb496a59aafb76a
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; @llvm.aarch64.cls must be directly translated into the 'cls' instruction
; The 32-bit ACLE intrinsic must lower to a single 'cls w, w' with no extra
; moves or masking.
define i32 @cls(i32 %t) {
; CHECK-LABEL: cls:
; CHECK: // %bb.0:
; CHECK-NEXT: cls w0, w0
; CHECK-NEXT: ret
%cls.i = call i32 @llvm.aarch64.cls(i32 %t)
ret i32 %cls.i
}
; The 64-bit intrinsic returns i32: expect a single 64-bit 'cls x, x' followed
; only by the implicit x0->w0 truncation (the 'kill' annotation below).
define i32 @cls64(i64 %t) {
; CHECK-LABEL: cls64:
; CHECK: // %bb.0:
; CHECK-NEXT: cls x0, x0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%cls.i = call i32 @llvm.aarch64.cls64(i64 %t)
ret i32 %cls.i
}
declare i32 @llvm.aarch64.cls(i32) nounwind
declare i32 @llvm.aarch64.cls64(i64) nounwind
; The ashr/xor/ctlz/sub-1 idiom counts leading redundant sign bits, i.e. CLS.
; For i8 the operation is promoted to i32 (sxtb), and the 24 extra leading
; bits are compensated for by the 'sub #24' in the output.
define i8 @cls_i8(i8 %x) {
; CHECK-LABEL: cls_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sxtb w8, w0
; CHECK-NEXT: cls w8, w8
; CHECK-NEXT: sub w0, w8, #24
; CHECK-NEXT: ret
%a = ashr i8 %x, 7
%b = xor i8 %x, %a
%c = call i8 @llvm.ctlz.i8(i8 %b, i1 false)
%d = sub i8 %c, 1
ret i8 %d
}
; The result is in the range [0, 31], so we don't need an 'and' after the cls.
define i32 @cls_i32_knownbits(i32 %x) {
; CHECK-LABEL: cls_i32_knownbits:
; CHECK: // %bb.0:
; CHECK-NEXT: cls w0, w0
; CHECK-NEXT: ret
%a = ashr i32 %x, 31
%b = xor i32 %x, %a
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
%d = sub i32 %c, 1
; %d fits in 5 bits, so this mask is redundant and must fold away.
%e = and i32 %d, 31
ret i32 %e
}
; There are at least 16 redundant sign bits so we don't need an orr after the cls.
define i32 @cls_i32_knownbits_2(i16 signext %x) {
; CHECK-LABEL: cls_i32_knownbits_2:
; CHECK: // %bb.0:
; CHECK-NEXT: cls w0, w0
; CHECK-NEXT: ret
%sext = sext i16 %x to i32
%a = ashr i32 %sext, 31
%b = xor i32 %sext, %a
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
%d = sub i32 %c, 1
; cls of a sign-extended i16 is >= 16, so bit 4 is already known set and the
; 'or 16' must fold away.
%e = or i32 %d, 16
ret i32 %e
}
; Check that the known-bits range maximum computed for the i64 cls result
; is not incorrectly set to 32 (the i32 bound).
; The cls operand here is i64, so its result may be as large as 63; known-bits
; must not assume the i32 bound. If the range maximum were wrongly capped, the
; trailing 'orr #0x10' would be folded away.
define i64 @cls_i64_not_32(i64 %x) {
; CHECK-SD-LABEL: cls_i64_not_32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: asr x8, x0, #16
; CHECK-SD-NEXT: cls x8, x8
; CHECK-SD-NEXT: orr x0, x8, #0x10
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: cls_i64_not_32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: asr x8, x0, #63
; CHECK-GI-NEXT: eor x8, x8, x0, asr #16
; CHECK-GI-NEXT: lsl x8, x8, #1
; CHECK-GI-NEXT: orr x8, x8, #0x1
; CHECK-GI-NEXT: clz x8, x8
; CHECK-GI-NEXT: orr x0, x8, #0x10
; CHECK-GI-NEXT: ret
%val = ashr i64 %x, 16
%a = ashr i64 %val, 63
%b = xor i64 %val, %a
; The shl-by-1 / or-1 trick makes the zero-undef ctlz below total and
; equivalent to counting leading sign bits of %val (SDAG matches it to cls).
%c = shl i64 %b, 1
%d = or i64 %c, 1
%e = call i64 @llvm.ctlz.i64(i64 %d, i1 true)
%f = or i64 %e, 16
ret i64 %f
}
; There are at least 24 redundant sign bits so we don't need an orr after the cls.
define i32 @cls_i32_knownbits_3(i8 signext %x) {
; CHECK-LABEL: cls_i32_knownbits_3:
; CHECK: // %bb.0:
; CHECK-NEXT: cls w0, w0
; CHECK-NEXT: ret
%sext = sext i8 %x to i32
%a = ashr i32 %sext, 31
%b = xor i32 %sext, %a
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
%d = sub i32 %c, 1
; cls of a sign-extended i8 is >= 24, so bits 3 and 4 are known set and the
; 'or 24' must fold away.
%e = or i32 %d, 24
ret i32 %e
}
; Negative test. We only know there is at least 1 redundant sign bit. We can't
; remove the orr.
define i32 @cls_i32_knownbits_4(i32 signext %x) {
; CHECK-LABEL: cls_i32_knownbits_4:
; CHECK: // %bb.0:
; CHECK-NEXT: sbfx w8, w0, #0, #31
; CHECK-NEXT: cls w8, w8
; CHECK-NEXT: orr w0, w8, #0x1
; CHECK-NEXT: ret
; shl-by-1 then ashr-by-1 (sbfx in the output) guarantees only one redundant
; sign bit, so bit 0 of the cls result is not known and the orr must remain.
%shl = shl i32 %x, 1
%ashr = ashr i32 %shl, 1
%a = ashr i32 %ashr, 31
%b = xor i32 %ashr, %a
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
%d = sub i32 %c, 1
%e = or i32 %d, 1
ret i32 %e
}
; Negative test. Check that the number of sign bits is not
; overestimated. If it is, the orr disappears.
; 'x ashr 15' has at least 15 (not 16) redundant sign bits, so the cls result
; is in [15, 31]: bit 4 is NOT known set and the 'orr #0x10' must remain.
; An off-by-one overestimate of the sign-bit count would delete it.
define i32 @cls_i32_knownbits_no_overestimate(i32 signext %x) {
; CHECK-SD-LABEL: cls_i32_knownbits_no_overestimate:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: asr w8, w0, #15
; CHECK-SD-NEXT: cls w8, w8
; CHECK-SD-NEXT: orr w0, w8, #0x10
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: cls_i32_knownbits_no_overestimate:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: asr w8, w0, #31
; CHECK-GI-NEXT: eor w8, w8, w0, asr #15
; CHECK-GI-NEXT: clz w8, w8
; CHECK-GI-NEXT: sub w8, w8, #1
; CHECK-GI-NEXT: orr w0, w8, #0x10
; CHECK-GI-NEXT: ret
%ashr = ashr i32 %x, 15
%a = ashr i32 %ashr, 31
%b = xor i32 %ashr, %a
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
%d = sub i32 %c, 1
%e = or i32 %d, 16
ret i32 %e
}
; Test that computeKnownBits works with NEON CLS intrinsics now that they use ISD::CTLS.
; The CLS result for v4i32 is in range [0, 31], so the `and 31` should be optimized away.
define <4 x i32> @neon_cls_v4i32_knownbits(<4 x i32> %a) nounwind {
; CHECK-LABEL: neon_cls_v4i32_knownbits:
; CHECK: // %bb.0:
; CHECK-NEXT: cls v0.4s, v0.4s
; CHECK-NEXT: ret
%result = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a)
; Each lane of the NEON CLS result fits in 5 bits, so this per-lane mask is
; redundant and must fold away (only a single 'cls' should be emitted).
%and = and <4 x i32> %result, <i32 31, i32 31, i32 31, i32 31>
ret <4 x i32> %and
}
declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone
; Test ensures that the compiler generates no extra instructions
; for __builtin_clzg output type conversion
; The 'zext nneg' of the i8 ctlz result should need no extra masking in the
; SDAG output; GlobalISel currently still emits a trailing 'and #0xff'.
define i32 @foo8(i8 %0) {
; CHECK-SD-LABEL: foo8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and w8, w0, #0xff
; CHECK-SD-NEXT: clz w8, w8
; CHECK-SD-NEXT: sub w0, w8, #24
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: foo8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: and w8, w0, #0xff
; CHECK-GI-NEXT: clz w8, w8
; CHECK-GI-NEXT: sub w8, w8, #24
; CHECK-GI-NEXT: and w0, w8, #0xff
; CHECK-GI-NEXT: ret
%2 = tail call i8 @llvm.ctlz.i8(i8 %0, i1 false)
%3 = zext nneg i8 %2 to i32
ret i32 %3
}
; Test ensures that the compiler generates no extra instructions
; for __builtin_clzg output type conversion
; Same as foo8 but for i16: the 'zext nneg' of the ctlz result should need no
; extra masking in the SDAG output; GlobalISel still emits an 'and #0xffff'.
define i32 @foo16(i16 %0) {
; CHECK-SD-LABEL: foo16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and w8, w0, #0xffff
; CHECK-SD-NEXT: clz w8, w8
; CHECK-SD-NEXT: sub w0, w8, #16
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: foo16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: and w8, w0, #0xffff
; CHECK-GI-NEXT: clz w8, w8
; CHECK-GI-NEXT: sub w8, w8, #16
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
%2 = tail call i16 @llvm.ctlz.i16(i16 %0, i1 false)
%3 = zext nneg i16 %2 to i32
ret i32 %3
}