[AArch64] Improve scalar and Neon popcount with SVE CNT. (#143870)

When available, we can use SVE's CNT instruction to improve the lowering of
scalar and fixed-length popcount (CTPOP) since the SVE instruction supports
types that the Neon variant doesn't.

For the scalar types, I see the following speedups on NVIDIA Grace CPU:

| size (bits) | before (Gibit/s) | after (Gibit/s) | speedup |
|------------:|-----------------:|----------------:|--------:|
|          32 |            75.20 |           86.79 |    1.15 |
|          64 |           149.87 |          173.70 |    1.16 |
|         128 |           158.56 |          164.88 |    1.04 |
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c8b1eaf..fb8bd81 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10683,12 +10683,9 @@ EVT VT = Op.getValueType(); if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) + useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); - if (!Subtarget->isNeonAvailable()) - return SDValue(); - bool IsParity = Op.getOpcode() == ISD::PARITY; SDValue Val = Op.getOperand(0); SDLoc DL(Op); @@ -10698,6 +10695,36 @@ if (VT == MVT::i32 && IsParity) return SDValue(); + if (Subtarget->isSVEorStreamingSVEAvailable()) { + if (VT == MVT::i32 || VT == MVT::i64) { + EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64; + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), Val, + DAG.getVectorIdxConstant(0, DL)); + Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val); + Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val, + DAG.getVectorIdxConstant(0, DL)); + if (IsParity) + Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT)); + return Val; + } + + if (VT == MVT::i128) { + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val); + Val = convertToScalableVector(DAG, MVT::nxv2i64, Val); + Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val); + Val = convertFromScalableVector(DAG, MVT::v2i64, Val); + Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val); + Val = DAG.getZExtOrTrunc(Val, DL, VT); + if (IsParity) + Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT)); + return Val; + } + } + + if (!Subtarget->isNeonAvailable()) + return SDValue(); + // If there is no CNT instruction available, GPR popcount can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index c158d8a..3900910 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -23,15 +23,36 @@ ; CHECKO0-NEXT: fmov w0, s0 ; CHECKO0-NEXT: ret ; -; CHECK-LABEL: popcount128: -; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: ld1 { v0.d }[1], [x8] -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: addv b0, v0.16b -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; NEON-LABEL: popcount128: +; NEON: // %bb.0: // %Entry +; NEON-NEXT: ldr d0, [x0] +; NEON-NEXT: add x8, x0, #8 +; NEON-NEXT: ld1 { v0.d }[1], [x8] +; NEON-NEXT: cnt v0.16b, v0.16b +; NEON-NEXT: addv b0, v0.16b +; NEON-NEXT: fmov w0, s0 +; NEON-NEXT: ret +; +; DOT-LABEL: popcount128: +; DOT: // %bb.0: // %Entry +; DOT-NEXT: ldr d0, [x0] +; DOT-NEXT: add x8, x0, #8 +; DOT-NEXT: ld1 { v0.d }[1], [x8] +; DOT-NEXT: cnt v0.16b, v0.16b +; DOT-NEXT: addv b0, v0.16b +; DOT-NEXT: fmov w0, s0 +; DOT-NEXT: ret +; +; SVE-LABEL: popcount128: +; SVE: // %bb.0: // %Entry +; SVE-NEXT: ldr d0, [x0] +; SVE-NEXT: add x8, x0, #8 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: ld1 { v0.d }[1], [x8] +; SVE-NEXT: cnt z0.d, p0/m, z0.d +; SVE-NEXT: addp d0, v0.2d +; SVE-NEXT: fmov w0, s0 +; SVE-NEXT: ret ; ; BE-LABEL: popcount128: ; BE: // %bb.0: // %Entry @@ -107,22 +128,55 @@ ; CHECKO0-NEXT: mov w0, w8 ; CHECKO0-NEXT: ret ; -; CHECK-LABEL: popcount256: -; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: ldr d0, [x0, #16] -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: add x9, x0, #24 -; CHECK-NEXT: ld1 { v0.d }[1], [x9] -; CHECK-NEXT: ld1 { v1.d }[1], [x8] -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: cnt v1.16b, v1.16b -; CHECK-NEXT: addv b0, v0.16b -; CHECK-NEXT: addv b1, v1.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w0, w9, w8 -; CHECK-NEXT: ret +; NEON-LABEL: popcount256: +; NEON: // %bb.0: // %Entry +; NEON-NEXT: ldr d0, [x0, #16] +; NEON-NEXT: ldr d1, [x0] +; NEON-NEXT: add x8, x0, #8 +; NEON-NEXT: add x9, x0, #24 +; NEON-NEXT: ld1 { v0.d }[1], [x9] +; NEON-NEXT: ld1 { v1.d }[1], [x8] +; NEON-NEXT: cnt 
v0.16b, v0.16b +; NEON-NEXT: cnt v1.16b, v1.16b +; NEON-NEXT: addv b0, v0.16b +; NEON-NEXT: addv b1, v1.16b +; NEON-NEXT: fmov w8, s0 +; NEON-NEXT: fmov w9, s1 +; NEON-NEXT: add w0, w9, w8 +; NEON-NEXT: ret +; +; DOT-LABEL: popcount256: +; DOT: // %bb.0: // %Entry +; DOT-NEXT: ldr d0, [x0, #16] +; DOT-NEXT: ldr d1, [x0] +; DOT-NEXT: add x8, x0, #8 +; DOT-NEXT: add x9, x0, #24 +; DOT-NEXT: ld1 { v0.d }[1], [x9] +; DOT-NEXT: ld1 { v1.d }[1], [x8] +; DOT-NEXT: cnt v0.16b, v0.16b +; DOT-NEXT: cnt v1.16b, v1.16b +; DOT-NEXT: addv b0, v0.16b +; DOT-NEXT: addv b1, v1.16b +; DOT-NEXT: fmov w8, s0 +; DOT-NEXT: fmov w9, s1 +; DOT-NEXT: add w0, w9, w8 +; DOT-NEXT: ret +; +; SVE-LABEL: popcount256: +; SVE: // %bb.0: // %Entry +; SVE-NEXT: ldr d0, [x0, #16] +; SVE-NEXT: ldr d1, [x0] +; SVE-NEXT: add x8, x0, #8 +; SVE-NEXT: add x9, x0, #24 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: ld1 { v0.d }[1], [x9] +; SVE-NEXT: ld1 { v1.d }[1], [x8] +; SVE-NEXT: cnt z0.d, p0/m, z0.d +; SVE-NEXT: cnt z1.d, p0/m, z1.d +; SVE-NEXT: add v0.2d, v1.2d, v0.2d +; SVE-NEXT: addp d0, v0.2d +; SVE-NEXT: fmov w0, s0 +; SVE-NEXT: ret ; ; BE-LABEL: popcount256: ; BE: // %bb.0: // %Entry @@ -223,15 +277,36 @@ ; CHECKO0-NEXT: bfi x0, x8, #32, #32 ; CHECKO0-NEXT: ret ; -; CHECK-LABEL: popcount1x128: -; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: mov x1, xzr -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: addv b0, v0.16b -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: ret +; NEON-LABEL: popcount1x128: +; NEON: // %bb.0: // %Entry +; NEON-NEXT: fmov d0, x0 +; NEON-NEXT: mov v0.d[1], x1 +; NEON-NEXT: mov x1, xzr +; NEON-NEXT: cnt v0.16b, v0.16b +; NEON-NEXT: addv b0, v0.16b +; NEON-NEXT: fmov x0, d0 +; NEON-NEXT: ret +; +; DOT-LABEL: popcount1x128: +; DOT: // %bb.0: // %Entry +; DOT-NEXT: fmov d0, x0 +; DOT-NEXT: mov v0.d[1], x1 +; DOT-NEXT: mov x1, xzr +; DOT-NEXT: cnt v0.16b, v0.16b +; DOT-NEXT: addv b0, v0.16b +; DOT-NEXT: fmov x0, d0 +; DOT-NEXT: ret +; +; 
SVE-LABEL: popcount1x128: +; SVE: // %bb.0: // %Entry +; SVE-NEXT: fmov d0, x0 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: mov v0.d[1], x1 +; SVE-NEXT: mov x1, xzr +; SVE-NEXT: cnt z0.d, p0/m, z0.d +; SVE-NEXT: addp d0, v0.2d +; SVE-NEXT: fmov x0, d0 +; SVE-NEXT: ret ; ; BE-LABEL: popcount1x128: ; BE: // %bb.0: // %Entry @@ -305,10 +380,10 @@ ; ; SVE-LABEL: popcount2x64: ; SVE: // %bb.0: // %Entry -; SVE-NEXT: cnt v0.16b, v0.16b -; SVE-NEXT: uaddlp v0.8h, v0.16b -; SVE-NEXT: uaddlp v0.4s, v0.8h -; SVE-NEXT: uaddlp v0.2d, v0.4s +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: cnt z0.d, p0/m, z0.d +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: ret ; ; BE-LABEL: popcount2x64: @@ -374,13 +449,29 @@ ; CHECKO0-NEXT: fmov d0, x8 ; CHECKO0-NEXT: ret ; -; CHECK-LABEL: popcount1x64: -; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlp v0.4h, v0.8b -; CHECK-NEXT: uaddlp v0.2s, v0.4h -; CHECK-NEXT: uaddlp v0.1d, v0.2s -; CHECK-NEXT: ret +; NEON-LABEL: popcount1x64: +; NEON: // %bb.0: // %Entry +; NEON-NEXT: cnt v0.8b, v0.8b +; NEON-NEXT: uaddlp v0.4h, v0.8b +; NEON-NEXT: uaddlp v0.2s, v0.4h +; NEON-NEXT: uaddlp v0.1d, v0.2s +; NEON-NEXT: ret +; +; DOT-LABEL: popcount1x64: +; DOT: // %bb.0: // %Entry +; DOT-NEXT: cnt v0.8b, v0.8b +; DOT-NEXT: uaddlp v0.4h, v0.8b +; DOT-NEXT: uaddlp v0.2s, v0.4h +; DOT-NEXT: uaddlp v0.1d, v0.2s +; DOT-NEXT: ret +; +; SVE-LABEL: popcount1x64: +; SVE: // %bb.0: // %Entry +; SVE-NEXT: ptrue p0.d, vl1 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: cnt z0.d, p0/m, z0.d +; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE-NEXT: ret ; ; BE-LABEL: popcount1x64: ; BE: // %bb.0: // %Entry @@ -442,9 +533,10 @@ ; ; SVE-LABEL: popcount4x32: ; SVE: // %bb.0: // %Entry -; SVE-NEXT: cnt v0.16b, v0.16b -; SVE-NEXT: uaddlp v0.8h, v0.16b -; SVE-NEXT: uaddlp v0.4s, v0.8h +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: 
cnt z0.s, p0/m, z0.s +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: ret ; ; BE-LABEL: popcount4x32: @@ -520,9 +612,10 @@ ; ; SVE-LABEL: popcount2x32: ; SVE: // %bb.0: // %Entry -; SVE-NEXT: cnt v0.8b, v0.8b -; SVE-NEXT: uaddlp v0.4h, v0.8b -; SVE-NEXT: uaddlp v0.2s, v0.4h +; SVE-NEXT: ptrue p0.s, vl2 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: cnt z0.s, p0/m, z0.s +; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE-NEXT: ret ; ; BE-LABEL: popcount2x32: @@ -577,11 +670,25 @@ ; CHECKO0-NEXT: uaddlp v0.8h, v0.16b ; CHECKO0-NEXT: ret ; -; CHECK-LABEL: popcount8x16: -; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b -; CHECK-NEXT: ret +; NEON-LABEL: popcount8x16: +; NEON: // %bb.0: // %Entry +; NEON-NEXT: cnt v0.16b, v0.16b +; NEON-NEXT: uaddlp v0.8h, v0.16b +; NEON-NEXT: ret +; +; DOT-LABEL: popcount8x16: +; DOT: // %bb.0: // %Entry +; DOT-NEXT: cnt v0.16b, v0.16b +; DOT-NEXT: uaddlp v0.8h, v0.16b +; DOT-NEXT: ret +; +; SVE-LABEL: popcount8x16: +; SVE: // %bb.0: // %Entry +; SVE-NEXT: ptrue p0.h, vl8 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: cnt z0.h, p0/m, z0.h +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE-NEXT: ret ; ; BE-LABEL: popcount8x16: ; BE: // %bb.0: // %Entry @@ -618,11 +725,25 @@ ; CHECKO0-NEXT: uaddlp v0.4h, v0.8b ; CHECKO0-NEXT: ret ; -; CHECK-LABEL: popcount4x16: -; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlp v0.4h, v0.8b -; CHECK-NEXT: ret +; NEON-LABEL: popcount4x16: +; NEON: // %bb.0: // %Entry +; NEON-NEXT: cnt v0.8b, v0.8b +; NEON-NEXT: uaddlp v0.4h, v0.8b +; NEON-NEXT: ret +; +; DOT-LABEL: popcount4x16: +; DOT: // %bb.0: // %Entry +; DOT-NEXT: cnt v0.8b, v0.8b +; DOT-NEXT: uaddlp v0.4h, v0.8b +; DOT-NEXT: ret +; +; SVE-LABEL: popcount4x16: +; SVE: // %bb.0: // %Entry +; SVE-NEXT: ptrue p0.h, vl4 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: cnt z0.h, p0/m, z0.h +; SVE-NEXT: 
// kill: def $d0 killed $d0 killed $z0 +; SVE-NEXT: ret ; ; BE-LABEL: popcount4x16: ; BE: // %bb.0: // %Entry @@ -673,20 +794,49 @@ ; CHECKO0-NEXT: mov w0, wzr ; CHECKO0-NEXT: ret ; -; CHECK-LABEL: ctpop_into_extract: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: addv b1, v1.8b -; CHECK-NEXT: mov v2.s[1], v1.s[0] -; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s -; CHECK-NEXT: str d0, [x8] -; CHECK-NEXT: ret +; NEON-LABEL: ctpop_into_extract: +; NEON: // %bb.0: +; NEON-NEXT: ldr d0, [x0] +; NEON-NEXT: movi v2.2d, #0xffffffffffffffff +; NEON-NEXT: mov x8, x0 +; NEON-NEXT: mov w0, wzr +; NEON-NEXT: fmov w9, s0 +; NEON-NEXT: fmov s1, w9 +; NEON-NEXT: cnt v1.8b, v1.8b +; NEON-NEXT: addv b1, v1.8b +; NEON-NEXT: mov v2.s[1], v1.s[0] +; NEON-NEXT: sub v0.2s, v0.2s, v2.2s +; NEON-NEXT: str d0, [x8] +; NEON-NEXT: ret +; +; DOT-LABEL: ctpop_into_extract: +; DOT: // %bb.0: +; DOT-NEXT: ldr d0, [x0] +; DOT-NEXT: movi v2.2d, #0xffffffffffffffff +; DOT-NEXT: mov x8, x0 +; DOT-NEXT: mov w0, wzr +; DOT-NEXT: fmov w9, s0 +; DOT-NEXT: fmov s1, w9 +; DOT-NEXT: cnt v1.8b, v1.8b +; DOT-NEXT: addv b1, v1.8b +; DOT-NEXT: mov v2.s[1], v1.s[0] +; DOT-NEXT: sub v0.2s, v0.2s, v2.2s +; DOT-NEXT: str d0, [x8] +; DOT-NEXT: ret +; +; SVE-LABEL: ctpop_into_extract: +; SVE: // %bb.0: +; SVE-NEXT: ldr d0, [x0] +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: movi v2.2d, #0xffffffffffffffff +; SVE-NEXT: mov x8, x0 +; SVE-NEXT: mov w0, wzr +; SVE-NEXT: movprfx z1, z0 +; SVE-NEXT: cnt z1.s, p0/m, z0.s +; SVE-NEXT: mov v2.s[1], v1.s[0] +; SVE-NEXT: sub v0.2s, v0.2s, v2.2s +; SVE-NEXT: str d0, [x8] +; SVE-NEXT: ret ; ; BE-LABEL: ctpop_into_extract: ; BE: // %bb.0: @@ -752,3 +902,5 @@ } declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll index 1e71c4b..b62b850 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
@@ -457,23 +457,25 @@ ret void } -; Don't use SVE for 64-bit vectors. define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: ctpop_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlp v0.4h, v0.8b +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: cnt z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op) ret <4 x i16> %res } -; Don't use SVE for 128-bit vectors. define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: ctpop_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: cnt z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -547,25 +549,25 @@ ret void } -; Don't use SVE for 64-bit vectors. define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: ctpop_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlp v0.4h, v0.8b -; CHECK-NEXT: uaddlp v0.2s, v0.4h +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: cnt z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op) ret <2 x i32> %res } -; Don't use SVE for 128-bit vectors. 
define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: ctpop_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b -; CHECK-NEXT: uaddlp v0.4s, v0.8h +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: cnt z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -639,27 +641,25 @@ ret void } -; Don't use SVE for 64-bit vectors. define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: ctpop_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlp v0.4h, v0.8b -; CHECK-NEXT: uaddlp v0.2s, v0.4h -; CHECK-NEXT: uaddlp v0.1d, v0.2s +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: cnt z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op) ret <1 x i64> %res } -; Don't use SVE for 128-bit vectors. define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: ctpop_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b -; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: uaddlp v0.2d, v0.4s +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: cnt z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op) ret <2 x i64> %res