llvm/test/CodeGen/AArch64/neon-anyof-splat.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc %s -o - | FileCheck %s
 target triple = "aarch64-linux-gnu"

 ;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
 ;; integer of a bitwidth equal to the number of lanes being reduced, then
 ;; compared against zero. To select between vectors for NEON, we then need to
 ;; broadcast the result, but we must be careful when the bitwidth of the scalar
 ;; result is smaller than the element size of the vectors being selected. We
 ;; don't want to end up with scalarization.

 ;; Four i32 lanes: the lane-wise "is negative" compare is bitcast to an i4,
 ;; which is narrower than the i32 elements of the vectors being selected.
 ;; Returns %a when no lane of %mask is negative, otherwise %b. The expected
 ;; assembly reduces the compare result, materialises an all-ones/all-zeros
 ;; scalar with csetm, broadcasts it with a single dup and selects with bsl.
 define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: any_of_select_vf4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
 ; CHECK-NEXT:    umaxv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    tst w8, #0x1
 ; CHECK-NEXT:    csetm w8, ne
 ; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
   ;; One i1 per lane, packed into an i4; a zero i4 means no lane was negative.
   %cmp = icmp slt <4 x i32> %mask, zeroinitializer
   %cmp.bc = bitcast <4 x i1> %cmp to i4
   %cmp.bc.not = icmp eq i4 %cmp.bc, 0
   %res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %res
 }

 ;; Two i64 lanes: the lane-wise "is negative" compare is bitcast to an i2,
 ;; far narrower than the i64 elements of the vectors being selected.
 ;; Returns %a when no lane of %mask is negative, otherwise %b. The expected
 ;; assembly reduces the compare result as four 32-bit lanes (umaxv on .4s),
 ;; then builds a 64-bit mask with csetm x8 and broadcasts it with dup .2d.
 define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: any_of_select_vf2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
 ; CHECK-NEXT:    umaxv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    tst w8, #0x1
 ; CHECK-NEXT:    csetm x8, ne
 ; CHECK-NEXT:    dup v0.2d, x8
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
   ;; One i1 per lane, packed into an i2; a zero i2 means no lane was negative.
   %cmp = icmp slt <2 x i64> %mask, zeroinitializer
   %cmp.bc = bitcast <2 x i1> %cmp to i2
   %cmp.bc.not = icmp eq i2 %cmp.bc, 0
   %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
   ret <2 x i64> %res
 }

 ;; Thirty-two i8 lanes: here the bitcast result (i32) is wider than the i8
 ;; elements of the vectors being selected, and each <32 x i8> operand appears
 ;; in the expected assembly as a pair of .16b registers.
 ;; Returns %a when no lane of %mask is negative, otherwise %b. The expected
 ;; assembly ORs the two mask halves before a single compare and reduction,
 ;; then reuses one broadcast mask (dup + mov) for the two bsl selects.
 define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
 ; CHECK-LABEL: any_of_select_vf32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    umaxv b0, v0.16b
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    tst w8, #0x1
 ; CHECK-NEXT:    csetm w8, ne
 ; CHECK-NEXT:    dup v1.16b, w8
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v4.16b, v2.16b
 ; CHECK-NEXT:    ret
   ;; One i1 per lane, packed into an i32; a zero i32 means no lane was negative.
   %cmp = icmp slt <32 x i8> %mask, zeroinitializer
   %cmp.bc = bitcast <32 x i1> %cmp to i32
   %cmp.bc.not = icmp eq i32 %cmp.bc, 0
   %res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b
   ret <32 x i8> %res
 }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
	; RUN: llc %s -o - | FileCheck %s
	target triple = "aarch64-linux-gnu"

	;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
	;; integer of a bitwidth equal to the number of lanes being reduced, then
	;; compared against zero. To select between vectors for NEON, we then need to
	;; broadcast the result, but we must be careful when the bitwidth of the scalar
	;; result is smaller than the element size of the vectors being selected. We
	;; don't want to end up with scalarization.

	;; Four i32 lanes: the lane-wise "is negative" compare is bitcast to an i4,
	;; which is narrower than the i32 elements of the vectors being selected.
	;; Returns %a when no lane of %mask is negative, otherwise %b. The expected
	;; assembly reduces the compare result, materialises an all-ones/all-zeros
	;; scalar with csetm, broadcasts it with a single dup and selects with bsl.
	define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
	; CHECK-LABEL: any_of_select_vf4:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
	; CHECK-NEXT: umaxv s0, v0.4s
	; CHECK-NEXT: fmov w8, s0
	; CHECK-NEXT: tst w8, #0x1
	; CHECK-NEXT: csetm w8, ne
	; CHECK-NEXT: dup v0.4s, w8
	; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
	; CHECK-NEXT: ret
	;; One i1 per lane, packed into an i4; a zero i4 means no lane was negative.
	%cmp = icmp slt <4 x i32> %mask, zeroinitializer
	%cmp.bc = bitcast <4 x i1> %cmp to i4
	%cmp.bc.not = icmp eq i4 %cmp.bc, 0
	%res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
	ret <4 x i32> %res
	}

	;; Two i64 lanes: the lane-wise "is negative" compare is bitcast to an i2,
	;; far narrower than the i64 elements of the vectors being selected.
	;; Returns %a when no lane of %mask is negative, otherwise %b. The expected
	;; assembly reduces the compare result as four 32-bit lanes (umaxv on .4s),
	;; then builds a 64-bit mask with csetm x8 and broadcasts it with dup .2d.
	define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
	; CHECK-LABEL: any_of_select_vf2:
	; CHECK: // %bb.0:
	; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
	; CHECK-NEXT: umaxv s0, v0.4s
	; CHECK-NEXT: fmov w8, s0
	; CHECK-NEXT: tst w8, #0x1
	; CHECK-NEXT: csetm x8, ne
	; CHECK-NEXT: dup v0.2d, x8
	; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
	; CHECK-NEXT: ret
	;; One i1 per lane, packed into an i2; a zero i2 means no lane was negative.
	%cmp = icmp slt <2 x i64> %mask, zeroinitializer
	%cmp.bc = bitcast <2 x i1> %cmp to i2
	%cmp.bc.not = icmp eq i2 %cmp.bc, 0
	%res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
	ret <2 x i64> %res
	}

	;; Thirty-two i8 lanes: here the bitcast result (i32) is wider than the i8
	;; elements of the vectors being selected, and each <32 x i8> operand appears
	;; in the expected assembly as a pair of .16b registers.
	;; Returns %a when no lane of %mask is negative, otherwise %b. The expected
	;; assembly ORs the two mask halves before a single compare and reduction,
	;; then reuses one broadcast mask (dup + mov) for the two bsl selects.
	define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
	; CHECK-LABEL: any_of_select_vf32:
	; CHECK: // %bb.0:
	; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
	; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
	; CHECK-NEXT: umaxv b0, v0.16b
	; CHECK-NEXT: fmov w8, s0
	; CHECK-NEXT: tst w8, #0x1
	; CHECK-NEXT: csetm w8, ne
	; CHECK-NEXT: dup v1.16b, w8
	; CHECK-NEXT: mov v0.16b, v1.16b
	; CHECK-NEXT: bsl v1.16b, v5.16b, v3.16b
	; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b
	; CHECK-NEXT: ret
	;; One i1 per lane, packed into an i32; a zero i32 means no lane was negative.
	%cmp = icmp slt <32 x i8> %mask, zeroinitializer
	%cmp.bc = bitcast <32 x i1> %cmp to i32
	%cmp.bc.not = icmp eq i32 %cmp.bc, 0
	%res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b
	ret <32 x i8> %res
	}