[AArch64] Add a test case showing both dup and scalar_to_reg in the same function. NFC
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 8655bb1..cdde110 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -1365,7 +1365,72 @@ ret void } -declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) { +; CHECK-SD-LABEL: cmplx_mul_combined_re_im: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: lsr x9, x0, #16 +; CHECK-SD-NEXT: adrp x8, .LCPI14_0 +; CHECK-SD-NEXT: dup v4.8h, w0 +; CHECK-SD-NEXT: dup v1.8h, w9 +; CHECK-SD-NEXT: fmov s3, w9 +; CHECK-SD-NEXT: sqneg v2.8h, v1.8h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b +; CHECK-SD-NEXT: rev32 v2.8h, v0.8h +; CHECK-SD-NEXT: sqdmull v3.4s, v0.4h, v4.4h +; CHECK-SD-NEXT: sqdmull2 v0.4s, v0.8h, v4.8h +; CHECK-SD-NEXT: sqdmlal v3.4s, v2.4h, v1.4h +; CHECK-SD-NEXT: sqdmlal2 v0.4s, v2.8h, v1.8h +; CHECK-SD-NEXT: uzp2 v0.8h, v3.8h, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmplx_mul_combined_re_im: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: lsr w9, w0, #16 +; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: rev32 v4.8h, v0.8h +; CHECK-GI-NEXT: dup v1.8h, w9 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: sqneg v2.8h, v1.8h +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: dup v3.8h, w0 +; CHECK-GI-NEXT: sqdmull v2.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: sqdmull v5.4s, v4.4h, v1.4h +; CHECK-GI-NEXT: sqdmlal v5.4s, v0.4h, v3.4h +; CHECK-GI-NEXT: sqdmlal2 v2.4s, v4.8h, v1.8h +; CHECK-GI-NEXT: uzp2 v0.8h, v5.8h, v2.8h +; CHECK-GI-NEXT: ret +entry: + %scale.sroa.0.0.extract.trunc = trunc i64 %scale.coerce to i16 + %scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16 + %scale.sroa.2.0.extract.trunc = trunc i64 %scale.sroa.2.0.extract.shift23 to i16 + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + %vecinit.i24 = insertelement <8 x i16> poison, i16 %scale.sroa.0.0.extract.trunc, i64 0 + %vecinit.i = insertelement <8 x i16> poison, i16 %scale.sroa.2.0.extract.trunc, i64 0 + %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> poison, <8 x i32> zeroinitializer + %vqnegq_v1.i = tail call noundef <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %vecinit7.i) + %vbsl5.i = shufflevector <8 x i16> %vqnegq_v1.i, <8 x i16> %vecinit.i, <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8> + %shuffle.i40 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %shuffle.i39 = shufflevector <8 x i16> %vecinit.i24, <8 x i16> poison, <4 x i32> zeroinitializer + %vqdmull_v2.i36 = tail call noundef <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i40, <4 x i16> %shuffle.i39) + %shuffle.i44 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vqdmull_v2.i = tail call noundef <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i44, <4 x i16> %shuffle.i39) + %shuffle.i38 = shufflevector <8 x i16> %shuffle.i, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %shuffle.i37 = shufflevector <8 x i16> %vbsl5.i, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %vqdmlal2.i45 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i38, <4 x i16> %shuffle.i37) + %vqdmlal_v3.i46 = tail call noundef <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %vqdmull_v2.i36, <4 x i32> %vqdmlal2.i45) + %shuffle.i42 = shufflevector <8 x i16> %shuffle.i, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i41 = shufflevector <8 x i16> %vbsl5.i, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i42, <4 x i16> %shuffle.i41) + %vqdmlal_v3.i = tail call noundef <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %vqdmull_v2.i, <4 x i32> %vqdmlal2.i) + %0 = bitcast <4 x i32> %vqdmlal_v3.i46 to <8 x i16> + %1 = bitcast <4 x i32> %vqdmlal_v3.i to <8 x i16> + %shuffle.i35 = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <8 x i16> %shuffle.i35 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}}