test/CodeGen/PowerPC/optimize-vector-not-equal.ll - llvm-project/llvm - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE

 ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
 ; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64

 ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
 ; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32

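 ; The current codegen compares the input vector in v2 for equality against a
 ; zero vector in v3 (vcmpequh). The compare result in v2 is then inverted
 ; (xxlnor) and masked with 1 (xxland), which converts each compare result:
 ; 0xFFFF -> 0
 ; 0 -> 1
 ; An optimized sequence will follow this NFC patch.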

 define i32 @cols_needed(<4 x i16> %wide.load) {
 ; POWERPC_64LE-LABEL: cols_needed:
 ; POWERPC_64LE:       # %bb.0: # %entry
 ; POWERPC_64LE-NEXT:    xxlxor v3, v3, v3
 ; POWERPC_64LE-NEXT:    li r3, 0
 ; POWERPC_64LE-NEXT:    vcmpequh v2, v2, v3
 ; POWERPC_64LE-NEXT:    vspltisw v3, 1
 ; POWERPC_64LE-NEXT:    xxlnor v2, v2, v2
 ; POWERPC_64LE-NEXT:    vmrglh v2, v2, v2
 ; POWERPC_64LE-NEXT:    xxland v2, v2, v3
 ; POWERPC_64LE-NEXT:    xxswapd v3, v2
 ; POWERPC_64LE-NEXT:    vadduwm v2, v2, v3
 ; POWERPC_64LE-NEXT:    xxspltw v3, v2, 2
 ; POWERPC_64LE-NEXT:    vadduwm v2, v2, v3
 ; POWERPC_64LE-NEXT:    vextuwrx r3, r3, v2
 ; POWERPC_64LE-NEXT:    blr
 ;
 ; POWERPC_64-LABEL: cols_needed:
 ; POWERPC_64:       # %bb.0: # %entry
 ; POWERPC_64-NEXT:    xxlxor v3, v3, v3
 ; POWERPC_64-NEXT:    li r3, 0
 ; POWERPC_64-NEXT:    vcmpequh v2, v2, v3
 ; POWERPC_64-NEXT:    vspltisw v3, 1
 ; POWERPC_64-NEXT:    xxlnor v2, v2, v2
 ; POWERPC_64-NEXT:    vmrghh v2, v2, v2
 ; POWERPC_64-NEXT:    xxland v2, v2, v3
 ; POWERPC_64-NEXT:    xxswapd v3, v2
 ; POWERPC_64-NEXT:    vadduwm v2, v2, v3
 ; POWERPC_64-NEXT:    xxspltw v3, v2, 1
 ; POWERPC_64-NEXT:    vadduwm v2, v2, v3
 ; POWERPC_64-NEXT:    vextuwlx r3, r3, v2
 ; POWERPC_64-NEXT:    blr
 ;
 ; POWERPC_32-LABEL: cols_needed:
 ; POWERPC_32:       # %bb.0: # %entry
 ; POWERPC_32-NEXT:    xxlxor v3, v3, v3
 ; POWERPC_32-NEXT:    vcmpequh v2, v2, v3
 ; POWERPC_32-NEXT:    vspltisw v3, 1
 ; POWERPC_32-NEXT:    xxlnor v2, v2, v2
 ; POWERPC_32-NEXT:    vmrghh v2, v2, v2
 ; POWERPC_32-NEXT:    xxland v2, v2, v3
 ; POWERPC_32-NEXT:    xxswapd v3, v2
 ; POWERPC_32-NEXT:    vadduwm v2, v2, v3
 ; POWERPC_32-NEXT:    xxspltw v3, v2, 1
 ; POWERPC_32-NEXT:    vadduwm v2, v2, v3
 ; POWERPC_32-NEXT:    stxv v2, -16(r1)
 ; POWERPC_32-NEXT:    lwz r3, -16(r1)
 ; POWERPC_32-NEXT:    blr
 entry:
   ; Lane-wise test: true for each i16 element of %wide.load that is not zero.
   %is.nonzero = icmp ne <4 x i16> %wide.load, zeroinitializer
   ; Widen every i1 flag to an i32 holding 0 or 1.
   %lane.ones = zext <4 x i1> %is.nonzero to <4 x i32>
   ; Add the four flags together, i.e. count the non-zero input elements.
   %nonzero.count = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %lane.ones)
   ret i32 %nonzero.count
 }

 ; Declaration of the add-reduction intrinsic called by @cols_needed: it takes
 ; a <4 x i32> vector and returns a single i32.
 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0

 ; Attribute group #0, referenced by the intrinsic declaration above.
 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
	; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
	; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE

	; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
	; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64

	; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
	; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32

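	; The current codegen compares the input vector in v2 for equality against a
	; zero vector in v3 (vcmpequh). The compare result in v2 is then inverted
	; (xxlnor) and masked with 1 (xxland), which converts each compare result:
	; 0xFFFF -> 0
	; 0 -> 1
	; An optimized sequence will follow this NFC patch.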

	; NOTE(review): this is a second definition of @cols_needed in the same file,
	; with the same body and assertions as the earlier one. It looks like a
	; duplication artifact -- confirm and keep only one copy.
	define i32 @cols_needed(<4 x i16> %wide.load) {
	; POWERPC_64LE-LABEL: cols_needed:
	; POWERPC_64LE: # %bb.0: # %entry
	; POWERPC_64LE-NEXT: xxlxor v3, v3, v3
	; POWERPC_64LE-NEXT: li r3, 0
	; POWERPC_64LE-NEXT: vcmpequh v2, v2, v3
	; POWERPC_64LE-NEXT: vspltisw v3, 1
	; POWERPC_64LE-NEXT: xxlnor v2, v2, v2
	; POWERPC_64LE-NEXT: vmrglh v2, v2, v2
	; POWERPC_64LE-NEXT: xxland v2, v2, v3
	; POWERPC_64LE-NEXT: xxswapd v3, v2
	; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
	; POWERPC_64LE-NEXT: xxspltw v3, v2, 2
	; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
	; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2
	; POWERPC_64LE-NEXT: blr
	;
	; POWERPC_64-LABEL: cols_needed:
	; POWERPC_64: # %bb.0: # %entry
	; POWERPC_64-NEXT: xxlxor v3, v3, v3
	; POWERPC_64-NEXT: li r3, 0
	; POWERPC_64-NEXT: vcmpequh v2, v2, v3
	; POWERPC_64-NEXT: vspltisw v3, 1
	; POWERPC_64-NEXT: xxlnor v2, v2, v2
	; POWERPC_64-NEXT: vmrghh v2, v2, v2
	; POWERPC_64-NEXT: xxland v2, v2, v3
	; POWERPC_64-NEXT: xxswapd v3, v2
	; POWERPC_64-NEXT: vadduwm v2, v2, v3
	; POWERPC_64-NEXT: xxspltw v3, v2, 1
	; POWERPC_64-NEXT: vadduwm v2, v2, v3
	; POWERPC_64-NEXT: vextuwlx r3, r3, v2
	; POWERPC_64-NEXT: blr
	;
	; POWERPC_32-LABEL: cols_needed:
	; POWERPC_32: # %bb.0: # %entry
	; POWERPC_32-NEXT: xxlxor v3, v3, v3
	; POWERPC_32-NEXT: vcmpequh v2, v2, v3
	; POWERPC_32-NEXT: vspltisw v3, 1
	; POWERPC_32-NEXT: xxlnor v2, v2, v2
	; POWERPC_32-NEXT: vmrghh v2, v2, v2
	; POWERPC_32-NEXT: xxland v2, v2, v3
	; POWERPC_32-NEXT: xxswapd v3, v2
	; POWERPC_32-NEXT: vadduwm v2, v2, v3
	; POWERPC_32-NEXT: xxspltw v3, v2, 1
	; POWERPC_32-NEXT: vadduwm v2, v2, v3
	; POWERPC_32-NEXT: stxv v2, -16(r1)
	; POWERPC_32-NEXT: lwz r3, -16(r1)
	; POWERPC_32-NEXT: blr
	entry:
	; %0: lane-wise flag, true for each i16 element of %wide.load that is not zero.
	%0 = icmp ne <4 x i16> %wide.load, zeroinitializer
	; %1: every i1 flag widened to an i32 holding 0 or 1.
	%1 = zext <4 x i1> %0 to <4 x i32>
	; %2: sum of the four flags, i.e. the number of non-zero input elements.
	%2 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
	ret i32 %2
	}

	; Declaration of the add-reduction intrinsic called by @cols_needed: it takes
	; a <4 x i32> vector and returns a single i32.
	; NOTE(review): the same declaration and attribute group already appear
	; earlier in this file -- presumably a duplication artifact; confirm.
	; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
	declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0

	; Attribute group #0, referenced by the intrinsic declaration above.
	attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }