llvm/test/CodeGen/AArch64/cls.ll - llvm-project.git - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 |  FileCheck %s --check-prefixes=CHECK,CHECK-GI

 ; @llvm.aarch64.cls must be directly translated into the 'cls' instruction

 define i32 @cls(i32 %t) {
 ; CHECK-LABEL: cls:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cls w0, w0
 ; CHECK-NEXT:    ret
   %cls.i = call i32 @llvm.aarch64.cls(i32 %t)
   ret i32 %cls.i
 }

 define i32 @cls64(i64 %t) {
 ; CHECK-LABEL: cls64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cls x0, x0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
   %cls.i = call i32 @llvm.aarch64.cls64(i64 %t)
   ret i32 %cls.i
 }

 declare i32 @llvm.aarch64.cls(i32) nounwind
 declare i32 @llvm.aarch64.cls64(i64) nounwind

 define i8 @cls_i8(i8 %x) {
 ; CHECK-LABEL: cls_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    cls w8, w8
 ; CHECK-NEXT:    sub w0, w8, #24
 ; CHECK-NEXT:    ret

   %a = ashr i8 %x, 7
   %b = xor i8 %x, %a
   %c = call i8 @llvm.ctlz.i8(i8 %b, i1 false)
   %d = sub i8 %c, 1
   ret i8 %d
 }

 ; The result is in the range [1-31], so we don't need an andi after the cls.
 define i32 @cls_i32_knownbits(i32 %x) {
 ; CHECK-LABEL: cls_i32_knownbits:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cls w0, w0
 ; CHECK-NEXT:    ret
   %a = ashr i32 %x, 31
   %b = xor i32 %x, %a
   %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
   %d = sub i32 %c, 1
   %e = and i32 %d, 31
   ret i32 %e
 }

 ; There are at least 16 redundant sign bits so we don't need an ori after the cls.
 define i32 @cls_i32_knownbits_2(i16 signext %x) {
 ; CHECK-LABEL: cls_i32_knownbits_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cls w0, w0
 ; CHECK-NEXT:    ret
   %sext = sext i16 %x to i32
   %a = ashr i32 %sext, 31
   %b = xor i32 %sext, %a
   %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
   %d = sub i32 %c, 1
   %e = or i32 %d, 16
   ret i32 %e
 }

 ; Check that the range max in ctls cls knownbits
 ; is not set to 32
 define i64 @cls_i64_not_32(i64 %x) {
 ; CHECK-SD-LABEL: cls_i64_not_32:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    asr x8, x0, #16
 ; CHECK-SD-NEXT:    cls x8, x8
 ; CHECK-SD-NEXT:    orr x0, x8, #0x10
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: cls_i64_not_32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    asr x8, x0, #63
 ; CHECK-GI-NEXT:    eor x8, x8, x0, asr #16
 ; CHECK-GI-NEXT:    lsl x8, x8, #1
 ; CHECK-GI-NEXT:    orr x8, x8, #0x1
 ; CHECK-GI-NEXT:    clz x8, x8
 ; CHECK-GI-NEXT:    orr x0, x8, #0x10
 ; CHECK-GI-NEXT:    ret
   %val = ashr i64 %x, 16
   %a = ashr i64 %val, 63
   %b = xor i64 %val, %a
   %c = shl i64 %b, 1
   %d = or i64 %c, 1
   %e = call i64 @llvm.ctlz.i64(i64 %d, i1 true)
   %f = or i64 %e, 16
   ret i64 %f
 }

 ; There are at least 24 redundant sign bits so we don't need an ori after the clsw.
 define i32 @cls_i32_knownbits_3(i8 signext %x) {
 ; CHECK-LABEL: cls_i32_knownbits_3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cls w0, w0
 ; CHECK-NEXT:    ret
   %sext = sext i8 %x to i32
   %a = ashr i32 %sext, 31
   %b = xor i32 %sext, %a
   %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
   %d = sub i32 %c, 1
   %e = or i32 %d, 24
   ret i32 %e
 }

 ; Negative test. We only know there is at least 1 redundant sign bit. We can't
 ; remove the ori.
 define i32 @cls_i32_knownbits_4(i32 signext %x) {
 ; CHECK-LABEL: cls_i32_knownbits_4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sbfx w8, w0, #0, #31
 ; CHECK-NEXT:    cls w8, w8
 ; CHECK-NEXT:    orr w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %shl = shl i32 %x, 1
   %ashr = ashr i32 %shl, 1
   %a = ashr i32 %ashr, 31
   %b = xor i32 %ashr, %a
   %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
   %d = sub i32 %c, 1
   %e = or i32 %d, 1
   ret i32 %e
  }

 ; Negative test. Check that the number of sign bits is not
 ; overestimated. If it is, the orr disappears.
 define i32 @cls_i32_knownbits_no_overestimate(i32 signext %x) {
 ; CHECK-SD-LABEL: cls_i32_knownbits_no_overestimate:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    asr w8, w0, #15
 ; CHECK-SD-NEXT:    cls w8, w8
 ; CHECK-SD-NEXT:    orr w0, w8, #0x10
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: cls_i32_knownbits_no_overestimate:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    asr w8, w0, #31
 ; CHECK-GI-NEXT:    eor w8, w8, w0, asr #15
 ; CHECK-GI-NEXT:    clz w8, w8
 ; CHECK-GI-NEXT:    sub w8, w8, #1
 ; CHECK-GI-NEXT:    orr w0, w8, #0x10
 ; CHECK-GI-NEXT:    ret
   %ashr = ashr i32 %x, 15
   %a = ashr i32 %ashr, 31
   %b = xor i32 %ashr, %a
   %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
   %d = sub i32 %c, 1
   %e = or i32 %d, 16
   ret i32 %e
  }

 ; Test that computeKnownBits works with NEON CLS intrinsics now that they use ISD::CTLS.
 ; The CLS result for v4i32 is in range [0, 31], so the `and 31` should be optimized away.
 define <4 x i32> @neon_cls_v4i32_knownbits(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: neon_cls_v4i32_knownbits:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cls v0.4s, v0.4s
 ; CHECK-NEXT:    ret
   %result = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a)
   %and = and <4 x i32> %result, <i32 31, i32 31, i32 31, i32 31>
   ret <4 x i32> %and
 }

 declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone

 ; Test ensures that the compiler generates no extra instructions
 ; for __builtin_clzg output type conversion
 define i32 @foo8(i8 %0) {
 ; CHECK-SD-LABEL: foo8:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    and w8, w0, #0xff
 ; CHECK-SD-NEXT:    clz w8, w8
 ; CHECK-SD-NEXT:    sub w0, w8, #24
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: foo8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    and w8, w0, #0xff
 ; CHECK-GI-NEXT:    clz w8, w8
 ; CHECK-GI-NEXT:    sub w8, w8, #24
 ; CHECK-GI-NEXT:    and w0, w8, #0xff
 ; CHECK-GI-NEXT:    ret
   %2 = tail call i8 @llvm.ctlz.i8(i8 %0, i1 false)
   %3 = zext nneg i8 %2 to i32
   ret i32 %3
 }

 ; Test ensures that the compiler generates no extra instructions
 ; for __builtin_clzg output type conversion
 define i32 @foo16(i16 %0) {
 ; CHECK-SD-LABEL: foo16:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    and w8, w0, #0xffff
 ; CHECK-SD-NEXT:    clz w8, w8
 ; CHECK-SD-NEXT:    sub w0, w8, #16
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: foo16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    and w8, w0, #0xffff
 ; CHECK-GI-NEXT:    clz w8, w8
 ; CHECK-GI-NEXT:    sub w8, w8, #16
 ; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
   %2 = tail call i16 @llvm.ctlz.i16(i16 %0, i1 false)
   %3 = zext nneg i16 %2 to i32
   ret i32 %3
 }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
	; RUN: llc -mtriple=aarch64 %s -o - \| FileCheck %s --check-prefixes=CHECK,CHECK-SD
	; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 \| FileCheck %s --check-prefixes=CHECK,CHECK-GI

	; @llvm.aarch64.cls must be directly translated into the 'cls' instruction

	define i32 @cls(i32 %t) {
	; CHECK-LABEL: cls:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cls w0, w0
	; CHECK-NEXT: ret
	%cls.i = call i32 @llvm.aarch64.cls(i32 %t)
	ret i32 %cls.i
	}

	define i32 @cls64(i64 %t) {
	; CHECK-LABEL: cls64:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cls x0, x0
	; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
	; CHECK-NEXT: ret
	%cls.i = call i32 @llvm.aarch64.cls64(i64 %t)
	ret i32 %cls.i
	}

	declare i32 @llvm.aarch64.cls(i32) nounwind
	declare i32 @llvm.aarch64.cls64(i64) nounwind

	define i8 @cls_i8(i8 %x) {
	; CHECK-LABEL: cls_i8:
	; CHECK: // %bb.0:
	; CHECK-NEXT: sxtb w8, w0
	; CHECK-NEXT: cls w8, w8
	; CHECK-NEXT: sub w0, w8, #24
	; CHECK-NEXT: ret

	%a = ashr i8 %x, 7
	%b = xor i8 %x, %a
	%c = call i8 @llvm.ctlz.i8(i8 %b, i1 false)
	%d = sub i8 %c, 1
	ret i8 %d
	}

	; The result is in the range [1-31], so we don't need an andi after the cls.
	define i32 @cls_i32_knownbits(i32 %x) {
	; CHECK-LABEL: cls_i32_knownbits:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cls w0, w0
	; CHECK-NEXT: ret
	%a = ashr i32 %x, 31
	%b = xor i32 %x, %a
	%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
	%d = sub i32 %c, 1
	%e = and i32 %d, 31
	ret i32 %e
	}

	; There are at least 16 redundant sign bits so we don't need an ori after the cls.
	define i32 @cls_i32_knownbits_2(i16 signext %x) {
	; CHECK-LABEL: cls_i32_knownbits_2:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cls w0, w0
	; CHECK-NEXT: ret
	%sext = sext i16 %x to i32
	%a = ashr i32 %sext, 31
	%b = xor i32 %sext, %a
	%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
	%d = sub i32 %c, 1
	%e = or i32 %d, 16
	ret i32 %e
	}

	; Check that the range max in ctls cls knownbits
	; is not set to 32
	define i64 @cls_i64_not_32(i64 %x) {
	; CHECK-SD-LABEL: cls_i64_not_32:
	; CHECK-SD: // %bb.0:
	; CHECK-SD-NEXT: asr x8, x0, #16
	; CHECK-SD-NEXT: cls x8, x8
	; CHECK-SD-NEXT: orr x0, x8, #0x10
	; CHECK-SD-NEXT: ret
	;
	; CHECK-GI-LABEL: cls_i64_not_32:
	; CHECK-GI: // %bb.0:
	; CHECK-GI-NEXT: asr x8, x0, #63
	; CHECK-GI-NEXT: eor x8, x8, x0, asr #16
	; CHECK-GI-NEXT: lsl x8, x8, #1
	; CHECK-GI-NEXT: orr x8, x8, #0x1
	; CHECK-GI-NEXT: clz x8, x8
	; CHECK-GI-NEXT: orr x0, x8, #0x10
	; CHECK-GI-NEXT: ret
	%val = ashr i64 %x, 16
	%a = ashr i64 %val, 63
	%b = xor i64 %val, %a
	%c = shl i64 %b, 1
	%d = or i64 %c, 1
	%e = call i64 @llvm.ctlz.i64(i64 %d, i1 true)
	%f = or i64 %e, 16
	ret i64 %f
	}

	; There are at least 24 redundant sign bits so we don't need an ori after the clsw.
	define i32 @cls_i32_knownbits_3(i8 signext %x) {
	; CHECK-LABEL: cls_i32_knownbits_3:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cls w0, w0
	; CHECK-NEXT: ret
	%sext = sext i8 %x to i32
	%a = ashr i32 %sext, 31
	%b = xor i32 %sext, %a
	%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
	%d = sub i32 %c, 1
	%e = or i32 %d, 24
	ret i32 %e
	}

	; Negative test. We only know there is at least 1 redundant sign bit. We can't
	; remove the ori.
	define i32 @cls_i32_knownbits_4(i32 signext %x) {
	; CHECK-LABEL: cls_i32_knownbits_4:
	; CHECK: // %bb.0:
	; CHECK-NEXT: sbfx w8, w0, #0, #31
	; CHECK-NEXT: cls w8, w8
	; CHECK-NEXT: orr w0, w8, #0x1
	; CHECK-NEXT: ret
	%shl = shl i32 %x, 1
	%ashr = ashr i32 %shl, 1
	%a = ashr i32 %ashr, 31
	%b = xor i32 %ashr, %a
	%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
	%d = sub i32 %c, 1
	%e = or i32 %d, 1
	ret i32 %e
	}

	; Negative test. Check that the number of sign bits is not
	; overestimated. If it is, the orr disappears.
	define i32 @cls_i32_knownbits_no_overestimate(i32 signext %x) {
	; CHECK-SD-LABEL: cls_i32_knownbits_no_overestimate:
	; CHECK-SD: // %bb.0:
	; CHECK-SD-NEXT: asr w8, w0, #15
	; CHECK-SD-NEXT: cls w8, w8
	; CHECK-SD-NEXT: orr w0, w8, #0x10
	; CHECK-SD-NEXT: ret
	;
	; CHECK-GI-LABEL: cls_i32_knownbits_no_overestimate:
	; CHECK-GI: // %bb.0:
	; CHECK-GI-NEXT: asr w8, w0, #31
	; CHECK-GI-NEXT: eor w8, w8, w0, asr #15
	; CHECK-GI-NEXT: clz w8, w8
	; CHECK-GI-NEXT: sub w8, w8, #1
	; CHECK-GI-NEXT: orr w0, w8, #0x10
	; CHECK-GI-NEXT: ret
	%ashr = ashr i32 %x, 15
	%a = ashr i32 %ashr, 31
	%b = xor i32 %ashr, %a
	%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
	%d = sub i32 %c, 1
	%e = or i32 %d, 16
	ret i32 %e
	}

	; Test that computeKnownBits works with NEON CLS intrinsics now that they use ISD::CTLS.
	; The CLS result for v4i32 is in range [0, 31], so the `and 31` should be optimized away.
	define <4 x i32> @neon_cls_v4i32_knownbits(<4 x i32> %a) nounwind {
	; CHECK-LABEL: neon_cls_v4i32_knownbits:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cls v0.4s, v0.4s
	; CHECK-NEXT: ret
	%result = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a)
	%and = and <4 x i32> %result, <i32 31, i32 31, i32 31, i32 31>
	ret <4 x i32> %and
	}

	declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone

	; Test ensures that the compiler generates no extra instructions
	; for __builtin_clzg output type conversion
	define i32 @foo8(i8 %0) {
	; CHECK-SD-LABEL: foo8:
	; CHECK-SD: // %bb.0:
	; CHECK-SD-NEXT: and w8, w0, #0xff
	; CHECK-SD-NEXT: clz w8, w8
	; CHECK-SD-NEXT: sub w0, w8, #24
	; CHECK-SD-NEXT: ret
	;
	; CHECK-GI-LABEL: foo8:
	; CHECK-GI: // %bb.0:
	; CHECK-GI-NEXT: and w8, w0, #0xff
	; CHECK-GI-NEXT: clz w8, w8
	; CHECK-GI-NEXT: sub w8, w8, #24
	; CHECK-GI-NEXT: and w0, w8, #0xff
	; CHECK-GI-NEXT: ret
	%2 = tail call i8 @llvm.ctlz.i8(i8 %0, i1 false)
	%3 = zext nneg i8 %2 to i32
	ret i32 %3
	}

	; Test ensures that the compiler generates no extra instructions
	; for __builtin_clzg output type conversion
	define i32 @foo16(i16 %0) {
	; CHECK-SD-LABEL: foo16:
	; CHECK-SD: // %bb.0:
	; CHECK-SD-NEXT: and w8, w0, #0xffff
	; CHECK-SD-NEXT: clz w8, w8
	; CHECK-SD-NEXT: sub w0, w8, #16
	; CHECK-SD-NEXT: ret
	;
	; CHECK-GI-LABEL: foo16:
	; CHECK-GI: // %bb.0:
	; CHECK-GI-NEXT: and w8, w0, #0xffff
	; CHECK-GI-NEXT: clz w8, w8
	; CHECK-GI-NEXT: sub w8, w8, #16
	; CHECK-GI-NEXT: and w0, w8, #0xffff
	; CHECK-GI-NEXT: ret
	%2 = tail call i16 @llvm.ctlz.i16(i16 %0, i1 false)
	%3 = zext nneg i16 %2 to i32
	ret i32 %3
	}