[AArch64] Add some basic patterns for qshrn. With the truncssat nodes these are relatively simple tablegen patterns to add. The existing intrinsics are converted to shift+truncsat so they can lower using the new patterns. Fixes #112925.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 03e8885..bf2f067 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5950,6 +5950,27 @@ case Intrinsic::aarch64_neon_uqxtn: return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_neon_sqshrn: + if (Op.getValueType().isVector()) + return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(), + DAG.getNode(AArch64ISD::VASHR, dl, + Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); + case Intrinsic::aarch64_neon_sqshrun: + if (Op.getValueType().isVector()) + return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(), + DAG.getNode(AArch64ISD::VASHR, dl, + Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); + case Intrinsic::aarch64_neon_uqshrn: + if (Op.getValueType().isVector()) + return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(), + DAG.getNode(AArch64ISD::VLSHR, dl, + Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); case Intrinsic::aarch64_sve_whilelo: return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, /*IsEqual=*/false);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 37dd43a..76a1029 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8007,9 +8007,9 @@ defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - int_aarch64_neon_sqshrn>; + BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - int_aarch64_neon_sqshrun>; + BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>; defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), @@ -8030,7 +8030,7 @@ int_aarch64_neon_uqrshrn>; defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - int_aarch64_neon_uqshrn>; + BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>; defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", TriOpFrag<(add node:$LHS,
diff --git a/llvm/test/CodeGen/AArch64/qshrn.ll b/llvm/test/CodeGen/AArch64/qshrn.ll index eaba88d..0212ff5 100644 --- a/llvm/test/CodeGen/AArch64/qshrn.ll +++ b/llvm/test/CodeGen/AArch64/qshrn.ll
@@ -4,8 +4,7 @@ define <4 x i16> @NarrowAShrI32By5(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By5: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: sqshrn v0.4h, v0.4s, #5 ; CHECK-NEXT: ret %s = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5> %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) @@ -26,8 +25,7 @@ define <4 x i16> @NarrowAShrI32By5ToU16(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By5ToU16: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: sqshrun v0.4h, v0.4s, #5 ; CHECK-NEXT: ret %s = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5> %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) @@ -48,8 +46,7 @@ define <4 x i16> @NarrowLShrU32By5(<4 x i32> %x) { ; CHECK-LABEL: NarrowLShrU32By5: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.4s, v0.4s, #5 -; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: uqshrn v0.4h, v0.4s, #5 ; CHECK-NEXT: ret %s = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5> %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) @@ -71,8 +68,7 @@ define <2 x i32> @NarrowAShri64By5(<2 x i64> %x) { ; CHECK-LABEL: NarrowAShri64By5: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sqxtn v0.2s, v0.2d +; CHECK-NEXT: sqshrn v0.2s, v0.2d, #5 ; CHECK-NEXT: ret %s = ashr <2 x i64> %x, <i64 5, i64 5> %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %s) @@ -93,8 +89,7 @@ define <2 x i32> @NarrowAShri64By5ToU32(<2 x i64> %x) { ; CHECK-LABEL: NarrowAShri64By5ToU32: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sqxtun v0.2s, v0.2d +; CHECK-NEXT: sqshrun v0.2s, v0.2d, #5 ; CHECK-NEXT: ret %s = ashr <2 x i64> %x, <i64 5, i64 5> %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %s) @@ -115,8 +110,7 @@ define <2 x i32> @NarrowLShrU64By5(<2 x i64> %x) { ; CHECK-LABEL: NarrowLShrU64By5: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr 
v0.2d, v0.2d, #5 -; CHECK-NEXT: uqxtn v0.2s, v0.2d +; CHECK-NEXT: uqshrn v0.2s, v0.2d, #5 ; CHECK-NEXT: ret %s = lshr <2 x i64> %x, <i64 5, i64 5> %r = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %s) @@ -138,8 +132,7 @@ define <8 x i8> @NarrowAShri16By5(<8 x i16> %x) { ; CHECK-LABEL: NarrowAShri16By5: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: sqshrn v0.8b, v0.8h, #5 ; CHECK-NEXT: ret %s = ashr <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %s) @@ -160,8 +153,7 @@ define <8 x i8> @NarrowAShri16By5ToU8(<8 x i16> %x) { ; CHECK-LABEL: NarrowAShri16By5ToU8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sqxtun v0.8b, v0.8h +; CHECK-NEXT: sqshrun v0.8b, v0.8h, #5 ; CHECK-NEXT: ret %s = ashr <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %s) @@ -182,8 +174,7 @@ define <8 x i8> @NarrowLShrU16By5(<8 x i16> %x) { ; CHECK-LABEL: NarrowLShrU16By5: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.8h, v0.8h, #5 -; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: uqshrn v0.8b, v0.8h, #5 ; CHECK-NEXT: ret %s = lshr <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> %r = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %s) @@ -208,8 +199,7 @@ define <4 x i16> @NarrowAShrI32By31(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By31: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #16 -; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: sqshrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %s = ashr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16> %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) @@ -219,8 +209,7 @@ define <4 x i16> @NarrowAShrI32By31ToU16(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By31ToU16: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #16 -; CHECK-NEXT: 
sqxtun v0.4h, v0.4s +; CHECK-NEXT: sqshrun v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %s = ashr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16> %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) @@ -230,8 +219,7 @@ define <4 x i16> @NarrowLShrU32By31(<4 x i32> %x) { ; CHECK-LABEL: NarrowLShrU32By31: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: uqshrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %s = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16> %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) @@ -242,10 +230,8 @@ define <16 x i8> @signed_minmax_v8i16_to_v16i8(<16 x i16> %x) { ; CHECK-LABEL: signed_minmax_v8i16_to_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sshr v1.8h, v1.8h, #5 -; CHECK-NEXT: sqxtn v0.8b, v0.8h -; CHECK-NEXT: sqxtn2 v0.16b, v1.8h +; CHECK-NEXT: sqshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: sqshrn2 v0.16b, v1.8h, #5 ; CHECK-NEXT: ret entry: %s = ashr <16 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> @@ -258,10 +244,8 @@ define <16 x i8> @unsigned_minmax_v8i16_to_v16i8(<16 x i16> %x) { ; CHECK-LABEL: unsigned_minmax_v8i16_to_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushr v0.8h, v0.8h, #5 -; CHECK-NEXT: ushr v1.8h, v1.8h, #5 -; CHECK-NEXT: uqxtn v0.8b, v0.8h -; CHECK-NEXT: uqxtn2 v0.16b, v1.8h +; CHECK-NEXT: uqshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: uqshrn2 v0.16b, v1.8h, #5 ; CHECK-NEXT: ret entry: %s = lshr <16 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> @@ -273,10 +257,8 @@ define <16 x i8> @unsigned_signed_minmax_v8i16_to_v16i8(<16 x i16> %x) { ; CHECK-LABEL: unsigned_signed_minmax_v8i16_to_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sshr v1.8h, v1.8h, #5 -; CHECK-NEXT: sqxtun v0.8b, v0.8h -; CHECK-NEXT: sqxtun2 
v0.16b, v1.8h +; CHECK-NEXT: sqshrun v0.8b, v0.8h, #5 +; CHECK-NEXT: sqshrun2 v0.16b, v1.8h, #5 ; CHECK-NEXT: ret entry: %s = ashr <16 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> @@ -290,10 +272,8 @@ define <8 x i16> @signed_minmax_v4i32_to_v8i16(<8 x i32> %x) { ; CHECK-LABEL: signed_minmax_v4i32_to_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sshr v1.4s, v1.4s, #5 -; CHECK-NEXT: sqxtn v0.4h, v0.4s -; CHECK-NEXT: sqxtn2 v0.8h, v1.4s +; CHECK-NEXT: sqshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: sqshrn2 v0.8h, v1.4s, #5 ; CHECK-NEXT: ret entry: %s = ashr <8 x i32> %x, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -306,10 +286,8 @@ define <8 x i16> @unsigned_minmax_v4i32_to_v8i16(<8 x i32> %x) { ; CHECK-LABEL: unsigned_minmax_v4i32_to_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushr v0.4s, v0.4s, #5 -; CHECK-NEXT: ushr v1.4s, v1.4s, #5 -; CHECK-NEXT: uqxtn v0.4h, v0.4s -; CHECK-NEXT: uqxtn2 v0.8h, v1.4s +; CHECK-NEXT: uqshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: uqshrn2 v0.8h, v1.4s, #5 ; CHECK-NEXT: ret entry: %s = lshr <8 x i32> %x, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -321,10 +299,8 @@ define <8 x i16> @unsigned_signed_minmax_v4i32_to_v8i16(<8 x i32> %x) { ; CHECK-LABEL: unsigned_signed_minmax_v4i32_to_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sshr v1.4s, v1.4s, #5 -; CHECK-NEXT: sqxtun v0.4h, v0.4s -; CHECK-NEXT: sqxtun2 v0.8h, v1.4s +; CHECK-NEXT: sqshrun v0.4h, v0.4s, #5 +; CHECK-NEXT: sqshrun2 v0.8h, v1.4s, #5 ; CHECK-NEXT: ret entry: %s = ashr <8 x i32> %x, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -338,10 +314,8 @@ define <4 x i32> @signed_minmax_v4i64_to_v8i32(<4 x i64> %x) { ; CHECK-LABEL: signed_minmax_v4i64_to_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sshr v1.2d, v1.2d, #5 -; CHECK-NEXT: sqxtn 
v0.2s, v0.2d -; CHECK-NEXT: sqxtn2 v0.4s, v1.2d +; CHECK-NEXT: sqshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: sqshrn2 v0.4s, v1.2d, #5 ; CHECK-NEXT: ret entry: %s = ashr <4 x i64> %x, <i64 5, i64 5, i64 5, i64 5> @@ -354,10 +328,8 @@ define <4 x i32> @unsigned_minmax_v4i64_to_v8i32(<4 x i64> %x) { ; CHECK-LABEL: unsigned_minmax_v4i64_to_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushr v0.2d, v0.2d, #5 -; CHECK-NEXT: ushr v1.2d, v1.2d, #5 -; CHECK-NEXT: uqxtn v0.2s, v0.2d -; CHECK-NEXT: uqxtn2 v0.4s, v1.2d +; CHECK-NEXT: uqshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: uqshrn2 v0.4s, v1.2d, #5 ; CHECK-NEXT: ret entry: %s = lshr <4 x i64> %x, <i64 5, i64 5, i64 5, i64 5> @@ -369,10 +341,8 @@ define <4 x i32> @unsigned_signed_minmax_v4i64_to_v8i32(<4 x i64> %x) { ; CHECK-LABEL: unsigned_signed_minmax_v4i64_to_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sshr v1.2d, v1.2d, #5 -; CHECK-NEXT: sqxtun v0.2s, v0.2d -; CHECK-NEXT: sqxtun2 v0.4s, v1.2d +; CHECK-NEXT: sqshrun v0.2s, v0.2d, #5 +; CHECK-NEXT: sqshrun2 v0.4s, v1.2d, #5 ; CHECK-NEXT: ret entry: %s = ashr <4 x i64> %x, <i64 5, i64 5, i64 5, i64 5>