; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; CHECK-GI: warning: Instruction selection used fallback path for sqdmulh_1s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d
; Signed widening multiply, <8 x i8> -> <8 x i16>. Both operands come from
; memory; the intrinsic call is expected to select a single SMULL.
define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %prod = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i16> %prod
}
; Signed widening multiply, <4 x i16> -> <4 x i32>. Both operands come from
; memory; the intrinsic call is expected to select a single SMULL.
define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i32> %prod
}
; Signed widening multiply, <2 x i32> -> <2 x i64>. Both operands come from
; memory; the intrinsic call is expected to select a single SMULL.
define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i64> %prod
}
; Calls the smull intrinsic twice with the operands swapped and stores both
; results to adjacent 16-byte slots. The expected output contains one SMULL
; whose result is stored twice (stp q0, q0), i.e. the two calls are merged.
define void @commutable_smull(<2 x i32> %A, <2 x i32> %B, ptr %C) {
; CHECK-LABEL: commutable_smull:
; CHECK: // %bb.0:
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
  %mul.ab = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %B)
  %mul.ba = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %B, <2 x i32> %A)
  store <2 x i64> %mul.ab, ptr %C
  %C.hi = getelementptr i8, ptr %C, i64 16
  store <2 x i64> %mul.ba, ptr %C.hi
  ret void
}
; Signed widening-multiply intrinsics (one per element width) called by the
; tests in this file.
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; Unsigned widening multiply, <8 x i8> -> <8 x i16>. Both operands come from
; memory; the intrinsic call is expected to select a single UMULL.
define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %prod = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i16> %prod
}
; Unsigned widening multiply, <4 x i16> -> <4 x i32>. Both operands come from
; memory; the intrinsic call is expected to select a single UMULL.
define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i32> %prod
}
; Unsigned widening multiply, <2 x i32> -> <2 x i64>. Both operands come from
; memory; the intrinsic call is expected to select a single UMULL.
define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i64> %prod
}
; Calls the umull intrinsic twice with the operands swapped and stores both
; results to adjacent 16-byte slots. The expected output contains one UMULL
; whose result is stored twice (stp q0, q0), i.e. the two calls are merged.
define void @commutable_umull(<2 x i32> %A, <2 x i32> %B, ptr %C) {
; CHECK-LABEL: commutable_umull:
; CHECK: // %bb.0:
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
  %mul.ab = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %B)
  %mul.ba = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %B, <2 x i32> %A)
  store <2 x i64> %mul.ab, ptr %C
  %C.hi = getelementptr i8, ptr %C, i64 16
  store <2 x i64> %mul.ba, ptr %C.hi
  ret void
}
; Unsigned widening-multiply intrinsics (one per element width) called by the
; tests in this file.
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; Signed saturating doubling multiply long, <4 x i16> -> <4 x i32>, on two
; loaded operands; expected to select a single SQDMULL.
define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmull4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i32> %prod
}
; Signed saturating doubling multiply long, <2 x i32> -> <2 x i64>, on two
; loaded operands; expected to select a single SQDMULL.
define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmull2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i64> %prod
}
; sqdmull on the high halves (lanes 4-7) of two loaded <8 x i16> vectors.
; SelectionDAG is expected to load only the upper 8 bytes and use SQDMULL;
; GlobalISel is expected to load the full vectors and use SQDMULL2.
; The shuffle masks only reference the first operand, so the unused second
; operand is spelled as poison instead of the deprecated undef.
define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: sqdmull2_4s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr d0, [x0, #8]
; CHECK-SD-NEXT: ldr d1, [x1, #8]
; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmull2_4s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr q0, [x0]
; CHECK-GI-NEXT: ldr q1, [x1]
; CHECK-GI-NEXT: sqdmull2 v0.4s, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
  %load1 = load <8 x i16>, ptr %A
  %load2 = load <8 x i16>, ptr %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}
; sqdmull on the high halves (lanes 2-3) of two loaded <4 x i32> vectors.
; SelectionDAG is expected to load only the upper 8 bytes and use SQDMULL;
; GlobalISel is expected to load the full vectors and use SQDMULL2.
; The shuffle masks only reference the first operand, so the unused second
; operand is spelled as poison instead of the deprecated undef.
define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: sqdmull2_2d:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr d0, [x0, #8]
; CHECK-SD-NEXT: ldr d1, [x1, #8]
; CHECK-SD-NEXT: sqdmull v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmull2_2d:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr q0, [x0]
; CHECK-GI-NEXT: ldr q1, [x1]
; CHECK-GI-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
  %load1 = load <4 x i32>, ptr %A
  %load2 = load <4 x i32>, ptr %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}
; Signed saturating doubling multiply-long intrinsics called by the sqdmull,
; sqdmlal and sqdmlsl tests in this file.
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; Polynomial multiply long, <8 x i8> -> <8 x i16>, on two loaded operands;
; expected to select a single PMULL.
define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: pmull8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %prod = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i16> %prod
}
; Calls the pmull intrinsic twice with the operands swapped and stores both
; results to adjacent 16-byte slots. The expected output contains one PMULL
; whose result is stored twice (stp q0, q0), i.e. the two calls are merged.
; The byte offset of the second slot uses an i64 index, matching the other
; commutable_* tests in this file.
define void @commutable_pmull8h(<8 x i8> %A, <8 x i8> %B, ptr %C) {
; CHECK-LABEL: commutable_pmull8h:
; CHECK: // %bb.0:
; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
  %1 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %A, <8 x i8> %B)
  %2 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %B, <8 x i8> %A)
  store <8 x i16> %1, ptr %C
  %3 = getelementptr i8, ptr %C, i64 16
  store <8 x i16> %2, ptr %3
  ret void
}
; Polynomial multiply-long intrinsic used by pmull8h and commutable_pmull8h.
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
; sqdmulh intrinsic on two loaded <4 x i16> vectors; expected to select the
; 64-bit vector form of SQDMULH.
define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %res = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i16> %res
}
; sqdmulh intrinsic on two loaded <8 x i16> vectors; expected to select the
; 128-bit vector form of SQDMULH.
define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %lhs = load <8 x i16>, ptr %A
  %rhs = load <8 x i16>, ptr %B
  %res = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %res
}
; sqdmulh intrinsic on two loaded <2 x i32> vectors; expected to select the
; 64-bit vector form of SQDMULH.
define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %res = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %res
}
; sqdmulh intrinsic on two loaded <4 x i32> vectors; expected to select the
; 128-bit vector form of SQDMULH.
define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %lhs = load <4 x i32>, ptr %A
  %rhs = load <4 x i32>, ptr %B
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %res
}
; Scalar i32 form of the sqdmulh intrinsic. The operands are expected to be
; loaded straight into FP/SIMD registers, multiplied with the scalar SQDMULH,
; and the result moved back to w0 with fmov.
define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr s1, [x1]
; CHECK-NEXT: sqdmulh s0, s0, s1
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %lhs = load i32, ptr %A
  %rhs = load i32, ptr %B
  %res = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %lhs, i32 %rhs)
  ret i32 %res
}
; sqdmulh intrinsics (four vector shapes plus the scalar i32 form) called by
; the sqdmulh_* tests in this file.
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
; sqrdmulh intrinsic on two loaded <4 x i16> vectors; expected to select the
; 64-bit vector form of SQRDMULH.
define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %res = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i16> %res
}
; sqrdmulh intrinsic on two loaded <8 x i16> vectors; expected to select the
; 128-bit vector form of SQRDMULH.
define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %lhs = load <8 x i16>, ptr %A
  %rhs = load <8 x i16>, ptr %B
  %res = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %res
}
; sqrdmulh intrinsic on two loaded <2 x i32> vectors; expected to select the
; 64-bit vector form of SQRDMULH.
define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %res = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %res
}
; sqrdmulh intrinsic on two loaded <4 x i32> vectors; expected to select the
; 128-bit vector form of SQRDMULH.
define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %lhs = load <4 x i32>, ptr %A
  %rhs = load <4 x i32>, ptr %B
  %res = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %res
}
; Scalar i32 form of the sqrdmulh intrinsic. The operands are expected to be
; loaded straight into FP/SIMD registers, multiplied with the scalar SQRDMULH,
; and the result moved back to w0 with fmov.
define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr s1, [x1]
; CHECK-NEXT: sqrdmulh s0, s0, s1
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %lhs = load i32, ptr %A
  %rhs = load i32, ptr %B
  %res = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %lhs, i32 %rhs)
  ret i32 %res
}
; sqrdmulh intrinsics (four vector shapes plus the scalar i32 form) called by
; the sqrdmulh_* tests in this file.
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
; fmulx intrinsic on two loaded <2 x float> vectors; expected to select the
; 64-bit vector form of FMULX.
define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: fmulx_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %lhs = load <2 x float>, ptr %A
  %rhs = load <2 x float>, ptr %B
  %res = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
  ret <2 x float> %res
}
; fmulx intrinsic on two loaded <4 x float> vectors; expected to select the
; 128-bit vector form of FMULX.
define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: fmulx_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fmulx v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %lhs = load <4 x float>, ptr %A
  %rhs = load <4 x float>, ptr %B
  %res = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
  ret <4 x float> %res
}
; fmulx intrinsic on two loaded <2 x double> vectors; expected to select the
; 128-bit vector form of FMULX.
define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: fmulx_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fmulx v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %lhs = load <2 x double>, ptr %A
  %rhs = load <2 x double>, ptr %B
  %res = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
  ret <2 x double> %res
}
; fmulx intrinsics (one per vector shape) called by the fmulx_* tests above.
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
; acc + smull(a, b) on <4 x i32>: the widening multiply and the add are
; expected to fuse into a single SMLAL on the loaded accumulator.
define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %acc = load <4 x i32>, ptr %C
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %sum = add <4 x i32> %acc, %prod
  ret <4 x i32> %sum
}
; acc + smull(a, b) on <2 x i64>: the widening multiply and the add are
; expected to fuse into a single SMLAL on the loaded accumulator.
define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %acc = load <2 x i64>, ptr %C
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %sum = add <2 x i64> %acc, %prod
  ret <2 x i64> %sum
}
; Two smull results accumulated on top of a splat constant (257 = 0x0101 per
; i16 lane). SelectionDAG is expected to materialize the constant with movi
; and feed it through two chained SMLALs; GlobalISel is expected to emit
; SMULL + SMLAL and add the constant separately at the end.
define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
; CHECK-SD-LABEL: smlal8h_chain_with_constant:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: movi v3.16b, #1
; CHECK-SD-NEXT: smlal v3.8h, v0.8b, v2.8b
; CHECK-SD-NEXT: mvn v0.8b, v2.8b
; CHECK-SD-NEXT: smlal v3.8h, v1.8b, v0.8b
; CHECK-SD-NEXT: str q3, [x0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: smlal8h_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mvn v3.8b, v2.8b
; CHECK-GI-NEXT: smull v1.8h, v1.8b, v3.8b
; CHECK-GI-NEXT: movi v3.16b, #1
; CHECK-GI-NEXT: smlal v1.8h, v0.8b, v2.8b
; CHECK-GI-NEXT: add v0.8h, v1.8h, v3.8h
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
  %not.v3 = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %mul.first = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
  %acc.first = add <8 x i16> %mul.first, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
  %mul.second = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %not.v3)
  %acc.second = add <8 x i16> %acc.first, %mul.second
  store <8 x i16> %acc.second, ptr %dst
  ret void
}
; Two smull results accumulated on top of a splat of 257 in <2 x i64>.
; SelectionDAG is expected to build the constant with mov + dup and feed it
; through two chained SMLALs; GlobalISel is expected to emit SMULL + SMLAL and
; add the constant, loaded from the constant pool, at the end.
define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-SD-LABEL: smlal2d_chain_with_constant:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #257 // =0x101
; CHECK-SD-NEXT: dup v3.2d, x8
; CHECK-SD-NEXT: smlal v3.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: mvn v0.8b, v2.8b
; CHECK-SD-NEXT: smlal v3.2d, v1.2s, v0.2s
; CHECK-SD-NEXT: str q3, [x0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: smlal2d_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mvn v3.8b, v2.8b
; CHECK-GI-NEXT: adrp x8, .LCPI30_0
; CHECK-GI-NEXT: smull v1.2d, v1.2s, v3.2s
; CHECK-GI-NEXT: smlal v1.2d, v0.2s, v2.2s
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI30_0]
; CHECK-GI-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
  %not.v3 = xor <2 x i32> %v3, <i32 -1, i32 -1>
  %mul.first = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
  %acc.first = add <2 x i64> %mul.first, <i64 257, i64 257>
  %mul.second = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %not.v3)
  %acc.second = add <2 x i64> %acc.first, %mul.second
  store <2 x i64> %acc.second, ptr %dst
  ret void
}
; acc - smull(a, b) on <4 x i32>: the widening multiply and the subtract are
; expected to fuse into a single SMLSL on the loaded accumulator.
define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %acc = load <4 x i32>, ptr %C
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %diff = sub <4 x i32> %acc, %prod
  ret <4 x i32> %diff
}
; acc - smull(a, b) on <2 x i64>: the widening multiply and the subtract are
; expected to fuse into a single SMLSL on the loaded accumulator.
define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %acc = load <2 x i64>, ptr %C
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %diff = sub <2 x i64> %acc, %prod
  ret <2 x i64> %diff
}
; Two smull results subtracted in turn from a splat constant (257 = 0x0101
; per i16 lane). Both instruction selectors share one expected output: the
; constant is materialized with movi and fed through two chained SMLSLs.
define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
; CHECK-LABEL: smlsl8h_chain_with_constant:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v3.16b, #1
; CHECK-NEXT: smlsl v3.8h, v0.8b, v2.8b
; CHECK-NEXT: mvn v0.8b, v2.8b
; CHECK-NEXT: smlsl v3.8h, v1.8b, v0.8b
; CHECK-NEXT: str q3, [x0]
; CHECK-NEXT: ret
  %not.v3 = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %mul.first = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
  %acc.first = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %mul.first
  %mul.second = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %not.v3)
  %acc.second = sub <8 x i16> %acc.first, %mul.second
  store <8 x i16> %acc.second, ptr %dst
  ret void
}
; Two smull results subtracted in turn from a splat of 257 in <2 x i64>.
; Both selectors are expected to chain two SMLSLs on the constant; they differ
; only in how it is materialized (mov + dup for SelectionDAG, a constant-pool
; load for GlobalISel).
define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-SD-LABEL: smlsl2d_chain_with_constant:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #257 // =0x101
; CHECK-SD-NEXT: dup v3.2d, x8
; CHECK-SD-NEXT: smlsl v3.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: mvn v0.8b, v2.8b
; CHECK-SD-NEXT: smlsl v3.2d, v1.2s, v0.2s
; CHECK-SD-NEXT: str q3, [x0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: smlsl2d_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI34_0
; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI34_0]
; CHECK-GI-NEXT: smlsl v3.2d, v0.2s, v2.2s
; CHECK-GI-NEXT: mvn v0.8b, v2.8b
; CHECK-GI-NEXT: smlsl v3.2d, v1.2s, v0.2s
; CHECK-GI-NEXT: str q3, [x0]
; CHECK-GI-NEXT: ret
  %not.v3 = xor <2 x i32> %v3, <i32 -1, i32 -1>
  %mul.first = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
  %acc.first = sub <2 x i64> <i64 257, i64 257>, %mul.first
  %mul.second = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %not.v3)
  %acc.second = sub <2 x i64> %acc.first, %mul.second
  store <2 x i64> %acc.second, ptr %dst
  ret void
}
; Saturating add/subtract intrinsics. The sqdmlal*/sqdmlsl* tests below apply
; them to an accumulator and an sqdmull result.
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
; sqadd(acc, sqdmull(a, b)) on <4 x i32>: the two intrinsic calls are expected
; to fuse into a single SQDMLAL on the loaded accumulator.
define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlal4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %acc = load <4 x i32>, ptr %C
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}
; sqadd(acc, sqdmull(a, b)) on <2 x i64>: the two intrinsic calls are expected
; to fuse into a single SQDMLAL on the loaded accumulator.
define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlal2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %acc = load <2 x i64>, ptr %C
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %res = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}
; sqadd(acc, sqdmull(hi(a), hi(b))) where hi() takes lanes 4-7 of a loaded
; <8 x i16> vector. SelectionDAG is expected to load only the upper 8 bytes
; and use SQDMLAL; GlobalISel is expected to load the full vectors and use
; SQDMLAL2.
; The shuffle masks only reference the first operand, so the unused second
; operand is spelled as poison instead of the deprecated undef.
define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-SD-LABEL: sqdmlal2_4s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr q0, [x2]
; CHECK-SD-NEXT: ldr d1, [x0, #8]
; CHECK-SD-NEXT: ldr d2, [x1, #8]
; CHECK-SD-NEXT: sqdmlal v0.4s, v1.4h, v2.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal2_4s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: ldr q2, [x1]
; CHECK-GI-NEXT: ldr q0, [x2]
; CHECK-GI-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
; CHECK-GI-NEXT: ret
  %load1 = load <8 x i16>, ptr %A
  %load2 = load <8 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}
; sqadd(acc, sqdmull(hi(a), hi(b))) where hi() takes lanes 2-3 of a loaded
; <4 x i32> vector. SelectionDAG is expected to load only the upper 8 bytes
; and use SQDMLAL; GlobalISel is expected to load the full vectors and use
; SQDMLAL2.
; The shuffle masks only reference the first operand, so the unused second
; operand is spelled as poison instead of the deprecated undef.
define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-SD-LABEL: sqdmlal2_2d:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr q0, [x2]
; CHECK-SD-NEXT: ldr d1, [x0, #8]
; CHECK-SD-NEXT: ldr d2, [x1, #8]
; CHECK-SD-NEXT: sqdmlal v0.2d, v1.2s, v2.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal2_2d:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: ldr q2, [x1]
; CHECK-GI-NEXT: ldr q0, [x2]
; CHECK-GI-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s
; CHECK-GI-NEXT: ret
  %load1 = load <4 x i32>, ptr %A
  %load2 = load <4 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}
; sqsub(acc, sqdmull(a, b)) on <4 x i32>: the two intrinsic calls are expected
; to fuse into a single SQDMLSL on the loaded accumulator.
define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlsl4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %acc = load <4 x i32>, ptr %C
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}
; sqsub(acc, sqdmull(a, b)) on <2 x i64>: the two intrinsic calls are expected
; to fuse into a single SQDMLSL on the loaded accumulator.
define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlsl2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %acc = load <2 x i64>, ptr %C
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %res = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}
; sqsub(acc, sqdmull(hi(a), hi(b))) where hi() takes lanes 4-7 of a loaded
; <8 x i16> vector. SelectionDAG is expected to load only the upper 8 bytes
; and use SQDMLSL; GlobalISel is expected to load the full vectors and use
; SQDMLSL2.
; The shuffle masks only reference the first operand, so the unused second
; operand is spelled as poison instead of the deprecated undef.
define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-SD-LABEL: sqdmlsl2_4s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr q0, [x2]
; CHECK-SD-NEXT: ldr d1, [x0, #8]
; CHECK-SD-NEXT: ldr d2, [x1, #8]
; CHECK-SD-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl2_4s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: ldr q2, [x1]
; CHECK-GI-NEXT: ldr q0, [x2]
; CHECK-GI-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h
; CHECK-GI-NEXT: ret
  %load1 = load <8 x i16>, ptr %A
  %load2 = load <8 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}
; sqsub(acc, sqdmull(hi(a), hi(b))) where hi() takes lanes 2-3 of a loaded
; <4 x i32> vector. SelectionDAG is expected to load only the upper 8 bytes
; and use SQDMLSL; GlobalISel is expected to load the full vectors and use
; SQDMLSL2.
; The shuffle masks only reference the first operand, so the unused second
; operand is spelled as poison instead of the deprecated undef.
define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-SD-LABEL: sqdmlsl2_2d:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr q0, [x2]
; CHECK-SD-NEXT: ldr d1, [x0, #8]
; CHECK-SD-NEXT: ldr d2, [x1, #8]
; CHECK-SD-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl2_2d:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: ldr q2, [x1]
; CHECK-GI-NEXT: ldr q0, [x2]
; CHECK-GI-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s
; CHECK-GI-NEXT: ret
  %load1 = load <4 x i32>, ptr %A
  %load2 = load <4 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}
; acc + umull(a, b) on <4 x i32>: the widening multiply and the add are
; expected to fuse into a single UMLAL on the loaded accumulator.
define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %acc = load <4 x i32>, ptr %C
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %sum = add <4 x i32> %acc, %prod
  ret <4 x i32> %sum
}
; acc + umull(a, b) on <2 x i64>: the widening multiply and the add are
; expected to fuse into a single UMLAL on the loaded accumulator.
define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %acc = load <2 x i64>, ptr %C
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %sum = add <2 x i64> %acc, %prod
  ret <2 x i64> %sum
}
; Two umull results accumulated on top of a splat constant (257 = 0x0101 per
; i16 lane). SelectionDAG is expected to materialize the constant with movi
; and feed it through two chained UMLALs; GlobalISel is expected to emit
; UMULL + UMLAL and add the constant separately at the end.
define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
; CHECK-SD-LABEL: umlal8h_chain_with_constant:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: movi v3.16b, #1
; CHECK-SD-NEXT: umlal v3.8h, v0.8b, v2.8b
; CHECK-SD-NEXT: mvn v0.8b, v2.8b
; CHECK-SD-NEXT: umlal v3.8h, v1.8b, v0.8b
; CHECK-SD-NEXT: str q3, [x0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umlal8h_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mvn v3.8b, v2.8b
; CHECK-GI-NEXT: umull v1.8h, v1.8b, v3.8b
; CHECK-GI-NEXT: movi v3.16b, #1
; CHECK-GI-NEXT: umlal v1.8h, v0.8b, v2.8b
; CHECK-GI-NEXT: add v0.8h, v1.8h, v3.8h
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
  %not.v3 = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %mul.first = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
  %acc.first = add <8 x i16> %mul.first, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
  %mul.second = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %not.v3)
  %acc.second = add <8 x i16> %acc.first, %mul.second
  store <8 x i16> %acc.second, ptr %dst
  ret void
}
; Two umull results accumulated on top of a splat of 257 in <2 x i64>.
; SelectionDAG is expected to build the constant with mov + dup and feed it
; through two chained UMLALs; GlobalISel is expected to emit UMULL + UMLAL and
; add the constant, loaded from the constant pool, at the end.
define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-SD-LABEL: umlal2d_chain_with_constant:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #257 // =0x101
; CHECK-SD-NEXT: dup v3.2d, x8
; CHECK-SD-NEXT: umlal v3.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: mvn v0.8b, v2.8b
; CHECK-SD-NEXT: umlal v3.2d, v1.2s, v0.2s
; CHECK-SD-NEXT: str q3, [x0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umlal2d_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mvn v3.8b, v2.8b
; CHECK-GI-NEXT: adrp x8, .LCPI46_0
; CHECK-GI-NEXT: umull v1.2d, v1.2s, v3.2s
; CHECK-GI-NEXT: umlal v1.2d, v0.2s, v2.2s
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI46_0]
; CHECK-GI-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
  %not.v3 = xor <2 x i32> %v3, <i32 -1, i32 -1>
  %mul.first = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
  %acc.first = add <2 x i64> %mul.first, <i64 257, i64 257>
  %mul.second = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %not.v3)
  %acc.second = add <2 x i64> %acc.first, %mul.second
  store <2 x i64> %acc.second, ptr %dst
  ret void
}
; Loads two <4 x i16> multiplicands and a <4 x i32> accumulator and subtracts
; the umull product from the accumulator; a single umlsl is expected.
define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %acc = load <4 x i32>, ptr %C
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %res = sub <4 x i32> %acc, %prod
  ret <4 x i32> %res
}
; Loads two <2 x i32> multiplicands and a <2 x i64> accumulator and subtracts
; the umull product from the accumulator; a single umlsl is expected.
define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %acc = load <2 x i64>, ptr %C
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %res = sub <2 x i64> %acc, %prod
  ret <2 x i64> %res
}
; Subtracts two umull.v8i16 products (%v1 * %v3 and %v2 * ~%v3) from a splat
; of 257 and stores the result through %dst. Both selectors are expected to
; materialise the constant with movi and chain two umlsl onto it.
define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
; CHECK-LABEL: umlsl8h_chain_with_constant:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v3.16b, #1
; CHECK-NEXT: umlsl v3.8h, v0.8b, v2.8b
; CHECK-NEXT: mvn v0.8b, v2.8b
; CHECK-NEXT: umlsl v3.8h, v1.8b, v0.8b
; CHECK-NEXT: str q3, [x0]
; CHECK-NEXT: ret
  %v3.not = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %prod.a = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
  %diff.a = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %prod.a
  %prod.b = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %v3.not)
  %diff.b = sub <8 x i16> %diff.a, %prod.b
  store <8 x i16> %diff.b, ptr %dst
  ret void
}
; Subtracts two umull.v2i64 products (%v1 * %v3 and %v2 * ~%v3) from a splat
; of 257 and stores the result through %dst. Both selectors are expected to
; chain two umlsl onto the constant; they differ only in how the constant is
; materialised (mov + dup versus a constant-pool load).
define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-SD-LABEL: umlsl2d_chain_with_constant:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #257 // =0x101
; CHECK-SD-NEXT: dup v3.2d, x8
; CHECK-SD-NEXT: umlsl v3.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: mvn v0.8b, v2.8b
; CHECK-SD-NEXT: umlsl v3.2d, v1.2s, v0.2s
; CHECK-SD-NEXT: str q3, [x0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umlsl2d_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI50_0
; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI50_0]
; CHECK-GI-NEXT: umlsl v3.2d, v0.2s, v2.2s
; CHECK-GI-NEXT: mvn v0.8b, v2.8b
; CHECK-GI-NEXT: umlsl v3.2d, v1.2s, v0.2s
; CHECK-GI-NEXT: str q3, [x0]
; CHECK-GI-NEXT: ret
  %v3.not = xor <2 x i32> %v3, <i32 -1, i32 -1>
  %prod.a = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
  %diff.a = sub <2 x i64> <i64 257, i64 257>, %prod.a
  %prod.b = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %v3.not)
  %diff.b = sub <2 x i64> %diff.a, %prod.b
  store <2 x i64> %diff.b, ptr %dst
  ret void
}
; llvm.fma on three loaded <2 x float> vectors; a single vector fmla that
; accumulates into the third operand is expected.
define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmla_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr d0, [x2]
; CHECK-NEXT: fmla v0.2s, v2.2s, v1.2s
; CHECK-NEXT: ret
  %lhs = load <2 x float>, ptr %A
  %rhs = load <2 x float>, ptr %B
  %acc = load <2 x float>, ptr %C
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %acc)
  ret <2 x float> %res
}
; llvm.fma on three loaded <4 x float> vectors; a single vector fmla that
; accumulates into the third operand is expected.
define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmla_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: fmla v0.4s, v2.4s, v1.4s
; CHECK-NEXT: ret
  %lhs = load <4 x float>, ptr %A
  %rhs = load <4 x float>, ptr %B
  %acc = load <4 x float>, ptr %C
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %acc)
  ret <4 x float> %res
}
; llvm.fma on three loaded <2 x double> vectors; a single vector fmla that
; accumulates into the third operand is expected.
define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmla_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: fmla v0.2d, v2.2d, v1.2d
; CHECK-NEXT: ret
  %lhs = load <2 x double>, ptr %A
  %rhs = load <2 x double>, ptr %B
  %acc = load <2 x double>, ptr %C
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %rhs, <2 x double> %acc)
  ret <2 x double> %res
}
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; fma(A, -B, C) on <2 x float>, with the negation written as (-0.0 - B);
; the negation is expected to fold into a single fmls.
define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmls_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr d0, [x2]
; CHECK-NEXT: fmls v0.2s, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x float>, ptr %A
  %rhs = load <2 x float>, ptr %B
  %acc = load <2 x float>, ptr %C
  %rhs.neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs.neg, <2 x float> %acc)
  ret <2 x float> %res
}
; fma(A, -B, C) on <4 x float>, with the negation written as (-0.0 - B);
; the negation is expected to fold into a single fmls.
define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmls_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: fmls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: ret
  %lhs = load <4 x float>, ptr %A
  %rhs = load <4 x float>, ptr %B
  %acc = load <4 x float>, ptr %C
  %rhs.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs.neg, <4 x float> %acc)
  ret <4 x float> %res
}
; fma(A, -B, C) on <2 x double>, with the negation written as (-0.0 - B);
; the negation is expected to fold into a single fmls.
define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmls_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: fmls v0.2d, v1.2d, v2.2d
; CHECK-NEXT: ret
  %lhs = load <2 x double>, ptr %A
  %rhs = load <2 x double>, ptr %B
  %acc = load <2 x double>, ptr %C
  %rhs.neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %rhs.neg, <2 x double> %acc)
  ret <2 x double> %res
}
; Same as fmls_2s but with the negated value as the first fma multiplicand:
; fma(-B, A, C) on <2 x float> is still expected to select fmls.
define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmls_commuted_neg_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr d0, [x2]
; CHECK-NEXT: fmls v0.2s, v1.2s, v2.2s
; CHECK-NEXT: ret
  %lhs = load <2 x float>, ptr %A
  %rhs = load <2 x float>, ptr %B
  %acc = load <2 x float>, ptr %C
  %rhs.neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %rhs.neg, <2 x float> %lhs, <2 x float> %acc)
  ret <2 x float> %res
}
; Same as fmls_4s but with the negated value as the first fma multiplicand:
; fma(-B, A, C) on <4 x float> is still expected to select fmls.
define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmls_commuted_neg_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: fmls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: ret
  %lhs = load <4 x float>, ptr %A
  %rhs = load <4 x float>, ptr %B
  %acc = load <4 x float>, ptr %C
  %rhs.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %rhs.neg, <4 x float> %lhs, <4 x float> %acc)
  ret <4 x float> %res
}
; Same as fmls_2d but with the negated value as the first fma multiplicand:
; fma(-B, A, C) on <2 x double> is still expected to select fmls.
define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmls_commuted_neg_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: fmls v0.2d, v1.2d, v2.2d
; CHECK-NEXT: ret
  %lhs = load <2 x double>, ptr %A
  %rhs = load <2 x double>, ptr %B
  %acc = load <2 x double>, ptr %C
  %rhs.neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %rhs.neg, <2 x double> %lhs, <2 x double> %acc)
  ret <2 x double> %res
}
; fma(-c, splat(b[0]), a) on <2 x float>: the negation and the lane-0 splat
; are both expected to fold into one by-element fmls.
define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_2s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmls v0.2s, v2.2s, v1.s[0]
; CHECK-NEXT: ret
entry:
  %c.neg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
  %b.lane0 = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
  %res = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %c.neg, <2 x float> %b.lane0, <2 x float> %a)
  ret <2 x float> %res
}
; fma(-c, splat(b[0]), a) on <4 x float>: the negation and the lane-0 splat
; are both expected to fold into one by-element fmls.
define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_4s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.4s, v2.4s, v1.s[0]
; CHECK-NEXT: ret
entry:
  %c.neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %b.lane0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
  %res = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %c.neg, <4 x float> %b.lane0, <4 x float> %a)
  ret <4 x float> %res
}
; fma(-c, splat(b[0]), a) on <2 x double>: the negation and the lane-0 splat
; are both expected to fold into one by-element fmls.
define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_2d:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.2d, v2.2d, v1.d[0]
; CHECK-NEXT: ret
entry:
  %c.neg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
  %b.lane0 = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %res = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %c.neg, <2 x double> %b.lane0, <2 x double> %a)
  ret <2 x double> %res
}
; fma(splat(%c), %b, %a) on <2 x float>: the scalar %c is inserted into both
; lanes, so the by-element fmla (lane 0 of the widened scalar register) is
; expected, as in the 4s and 2d variants of this test.
; The multiplicand passed to the fma must be the fully built splat %v2. The
; half-built %v1 has lane 1 undef, which left %v2 dead and made the test cover
; only the plain vector fmla.
; Reviewer note: the assertions below were aligned by hand with the output
; recorded for fmla_indexed_scalar_2s_strict; regenerate them with
; utils/update_llc_test_checks.py to confirm both selectors agree.
define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[0]
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <2 x float> undef, float %c, i32 0
  %v2 = insertelement <2 x float> %v1, float %c, i32 1
  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a) nounwind
  ret <2 x float> %fmla1
}
; fma(splat(%c), %b, %a) on <4 x float>, with the splat built from four
; insertelement steps; a by-element fmla on lane 0 of the scalar's register
; is expected.
define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_4s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla v0.4s, v1.4s, v2.s[0]
; CHECK-NEXT: ret
entry:
  %ins0 = insertelement <4 x float> undef, float %c, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %c, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %c, i32 2
  %c.splat = insertelement <4 x float> %ins2, float %c, i32 3
  %res = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %c.splat, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}
; fma(splat(%c), %b, %a) on <2 x double>, with the splat built from two
; insertelement steps; a by-element fmla on lane 0 of the scalar's register
; is expected.
define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2d:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla v0.2d, v1.2d, v2.d[0]
; CHECK-NEXT: ret
entry:
  %ins0 = insertelement <2 x double> undef, double %c, i32 0
  %c.splat = insertelement <2 x double> %ins0, double %c, i32 1
  %res = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %c.splat, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}
; strictfp variant of fmls_indexed_2s: constrained fma(-c, splat(b[0]), a)
; with round.tonearest / fpexcept.strict is still expected to select the
; by-element fmls.
define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmls_indexed_2s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmls v0.2s, v2.2s, v1.s[0]
; CHECK-NEXT: ret
entry:
  %c.neg = fneg <2 x float> %c
  %b.lane0 = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
  %res = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %c.neg, <2 x float> %b.lane0, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x float> %res
}
; strictfp variant of fmls_indexed_4s: constrained fma(-c, splat(b[0]), a)
; with round.tonearest / fpexcept.strict is still expected to select the
; by-element fmls.
define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmls_indexed_4s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.4s, v2.4s, v1.s[0]
; CHECK-NEXT: ret
entry:
  %c.neg = fneg <4 x float> %c
  %b.lane0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
  %res = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %c.neg, <4 x float> %b.lane0, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <4 x float> %res
}
; strictfp variant of fmls_indexed_2d: constrained fma(-c, splat(b[0]), a)
; with round.tonearest / fpexcept.strict is still expected to select the
; by-element fmls.
define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmls_indexed_2d_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.2d, v2.2d, v1.d[0]
; CHECK-NEXT: ret
entry:
  %c.neg = fneg <2 x double> %c
  %b.lane0 = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %res = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %c.neg, <2 x double> %b.lane0, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x double> %res
}
; strictfp constrained fma(splat(%c), %b, %a) on <2 x float>; the splat of
; the scalar is expected to fold into a by-element fmla on lane 0.
define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmla_indexed_scalar_2s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[0]
; CHECK-NEXT: ret
entry:
  %ins0 = insertelement <2 x float> undef, float %c, i32 0
  %c.splat = insertelement <2 x float> %ins0, float %c, i32 1
  %res = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %c.splat, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x float> %res
}
; strictfp constrained fma(splat(%c), %b, %a) on <4 x float>; the splat of
; the scalar is expected to fold into a by-element fmla on lane 0.
define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmla_indexed_scalar_4s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla v0.4s, v1.4s, v2.s[0]
; CHECK-NEXT: ret
entry:
  %ins0 = insertelement <4 x float> undef, float %c, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %c, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %c, i32 2
  %c.splat = insertelement <4 x float> %ins2, float %c, i32 3
  %res = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %c.splat, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <4 x float> %res
}
; strictfp constrained fma(splat(%c), %b, %a) on <2 x double>; the splat of
; the scalar is expected to fold into a by-element fmla on lane 0.
define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmla_indexed_scalar_2d_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla v0.2d, v1.2d, v2.d[0]
; CHECK-NEXT: ret
entry:
  %ins0 = insertelement <2 x double> undef, double %c, i32 0
  %c.splat = insertelement <2 x double> %ins0, double %c, i32 1
  %res = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %c.splat, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x double> %res
}
attributes #0 = { strictfp }
declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
; <4 x i16> multiply of %A by a splat of lane 1 of %B; a by-element mul on
; h[1] is expected.
define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: mul_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = mul <4 x i16> %A, %B.lane1
  ret <4 x i16> %res
}
; <8 x i16> multiply of %A by a splat of lane 1 of %B; a by-element mul on
; h[1] is expected.
define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: mul_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = mul <8 x i16> %A, %B.lane1
  ret <8 x i16> %res
}
; <2 x i32> multiply of %A by a splat of lane 1 of %B; a by-element mul on
; s[1] is expected.
define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: mul_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %res = mul <2 x i32> %A, %B.lane1
  ret <2 x i32> %res
}
; <4 x i32> multiply of %A by a splat of lane 1 of %B; a by-element mul on
; s[1] is expected.
define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: mul_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = mul <4 x i32> %A, %B.lane1
  ret <4 x i32> %res
}
; Plain <2 x i64> multiply. Both expected sequences move each lane to a
; general-purpose register, multiply as scalars and rebuild the vector; the
; two selectors differ only in register assignment.
define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
; CHECK-SD-LABEL: mul_2d:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov x10, d1
; CHECK-SD-NEXT: fmov x11, d0
; CHECK-SD-NEXT: mov x8, v1.d[1]
; CHECK-SD-NEXT: mov x9, v0.d[1]
; CHECK-SD-NEXT: mul x10, x11, x10
; CHECK-SD-NEXT: mul x8, x9, x8
; CHECK-SD-NEXT: fmov d0, x10
; CHECK-SD-NEXT: mov v0.d[1], x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_2d:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov x10, d0
; CHECK-GI-NEXT: fmov x11, d1
; CHECK-GI-NEXT: mov x8, v0.d[1]
; CHECK-GI-NEXT: mov x9, v1.d[1]
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov d0, x10
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
  %prod = mul <2 x i64> %A, %B
  ret <2 x i64> %prod
}
; <2 x float> fmul of %A by a splat of lane 1 of %B; a by-element fmul on
; s[1] is expected.
define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: fmul_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
  %res = fmul <2 x float> %A, %B.lane1
  ret <2 x float> %res
}
; <4 x float> fmul of %A by a splat of lane 1 of %B; a by-element fmul on
; s[1] is expected.
define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-LABEL: fmul_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = fmul <4 x float> %A, %B.lane1
  ret <4 x float> %res
}
; <2 x double> fmul of %A by a splat of lane 1 of %B; a by-element fmul on
; d[1] is expected.
define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
; CHECK-LABEL: fmul_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul v0.2d, v0.2d, v1.d[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
  %res = fmul <2 x double> %A, %B.lane1
  ret <2 x double> %res
}
; Scalar float multiply by element 3 extracted from %vec; the extract is
; expected to fold into a scalar by-element fmul on s[3].
define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmul_lane_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul s0, s0, v1.s[3]
; CHECK-NEXT: ret
  %elt = extractelement <4 x float> %vec, i32 3
  %prod = fmul float %A, %elt
  ret float %prod
}
; Scalar double multiply by element 1 extracted from %vec; the extract is
; expected to fold into a scalar by-element fmul on d[1].
define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmul_lane_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, v1.d[1]
; CHECK-NEXT: ret
  %elt = extractelement <2 x double> %vec, i32 1
  %prod = fmul double %A, %elt
  ret double %prod
}
; fmulx intrinsic on <2 x float> with a splat of lane 1 of %B as the second
; operand; a by-element fmulx on s[1] is expected.
define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: fmulx_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
  %res = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %B.lane1)
  ret <2 x float> %res
}
; fmulx intrinsic on <4 x float> with a splat of lane 1 of %B as the second
; operand; a by-element fmulx on s[1] is expected.
define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-LABEL: fmulx_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %B.lane1)
  ret <4 x float> %res
}
; fmulx intrinsic on <2 x double> with a splat of lane 1 of %B as the second
; operand; a by-element fmulx on d[1] is expected.
define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
; CHECK-LABEL: fmulx_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx v0.2d, v0.2d, v1.d[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
  %res = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %B.lane1)
  ret <2 x double> %res
}
; sqdmulh intrinsic on <4 x i16> with a splat of lane 1 of %B; a by-element
; sqdmulh on h[1] is expected.
define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %B.lane1)
  ret <4 x i16> %res
}
; sqdmulh intrinsic on <8 x i16> with a splat of lane 1 of %B; a by-element
; sqdmulh on h[1] is expected.
define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %B.lane1)
  ret <8 x i16> %res
}
; sqdmulh intrinsic on <2 x i32> with a splat of lane 1 of %B; a by-element
; sqdmulh on s[1] is expected.
define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %B.lane1)
  ret <2 x i32> %res
}
; sqdmulh intrinsic on <4 x i32> with a splat of lane 1 of %B; a by-element
; sqdmulh on s[1] is expected.
define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %B.lane1)
  ret <4 x i32> %res
}
; Scalar i32 sqdmulh of %A with element 1 extracted from %B; the extract is
; expected to fold into a scalar by-element sqdmulh, with fmov moving the
; scalar operand and result between general-purpose and FP registers.
define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: sqdmulh s0, s1, v0.s[1]
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %elt = extractelement <4 x i32> %B, i32 1
  %res = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %elt)
  ret i32 %res
}
; sqrdmulh intrinsic on <4 x i16> with a splat of lane 1 of %B; a by-element
; sqrdmulh on h[1] is expected.
define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %B.lane1)
  ret <4 x i16> %res
}
; sqrdmulh intrinsic on <8 x i16> with a splat of lane 1 of %B; a by-element
; sqrdmulh on h[1] is expected.
define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %B.lane1)
  ret <8 x i16> %res
}
; sqrdmulh intrinsic on <2 x i32> with a splat of lane 1 of %B; a by-element
; sqrdmulh on s[1] is expected.
define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %B.lane1)
  ret <2 x i32> %res
}
; sqrdmulh intrinsic on <4 x i32> with a splat of lane 1 of %B; a by-element
; sqrdmulh on s[1] is expected.
define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %B.lane1)
  ret <4 x i32> %res
}
; Scalar i32 sqrdmulh of %A with element 1 extracted from %B; the extract is
; expected to fold into a scalar by-element sqrdmulh, with fmov moving the
; scalar operand and result between general-purpose and FP registers.
define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: sqrdmulh s0, s1, v0.s[1]
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %elt = extractelement <4 x i32> %B, i32 1
  %res = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %elt)
  ret i32 %res
}
; sqdmull intrinsic widening <4 x i16> to <4 x i32>, with a splat of lane 1
; of %B as the second operand; a by-element sqdmull on h[1] is expected.
define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqdmull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %B.lane1)
  ret <4 x i32> %res
}
; sqdmull intrinsic widening <2 x i32> to <2 x i64>, with a splat of lane 1
; of %B as the second operand; a by-element sqdmull on s[1] is expected.
define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqdmull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %B.lane1)
  ret <2 x i64> %res
}
; sqdmull of the high half of %A (elements 4-7) with a splat of lane 1 of
; %B. The SelectionDAG assertions expect a single by-element sqdmull2; the
; GlobalISel assertions expect the high half to be extracted first and then
; multiplied with a by-element sqdmull.
define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-SD-LABEL: sqdmull2_lane_4s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmull2 v0.4s, v0.8h, v1.h[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmull2_lane_4s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[1]
; CHECK-GI-NEXT: ret
  %A.hi = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %B.lane1 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A.hi, <4 x i16> %B.lane1)
  ret <4 x i32> %res
}
; sqdmull of the high half of %A (elements 2-3) with a splat of lane 1 of
; %B. The SelectionDAG assertions expect a single by-element sqdmull2; the
; GlobalISel assertions expect the high half to be extracted first and then
; multiplied with a by-element sqdmull.
define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-SD-LABEL: sqdmull2_lane_2d:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmull2_lane_2d:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[1]
; CHECK-GI-NEXT: ret
  %A.hi = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %B.lane1 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A.hi, <2 x i32> %B.lane1)
  ret <2 x i64> %res
}
; umull intrinsic widening <4 x i16> to <4 x i32>, with a splat of lane 1 of
; %B as the second operand; a by-element umull on h[1] is expected.
define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: umull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %B.lane1)
  ret <4 x i32> %res
}
; umull intrinsic widening <2 x i32> to <2 x i64>, with a splat of lane 1 of
; %B as the second operand; a by-element umull on s[1] is expected.
define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: umull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %B.lane1)
  ret <2 x i64> %res
}
; smull intrinsic widening <4 x i16> to <4 x i32>, with a splat of lane 1 of
; %B as the second operand; a by-element smull on h[1] is expected.
define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: smull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %B.lane1)
  ret <4 x i32> %res
}
; smull intrinsic widening <2 x i32> to <2 x i64>, with a splat of lane 1 of
; %B as the second operand; a by-element smull on s[1] is expected.
define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: smull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[1]
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %B.lane1)
  ret <2 x i64> %res
}
; Adds smull(%A, splat(%B[1])) to the accumulator %C; a by-element smlal on
; h[1] is expected, followed by a move of the accumulator into the return
; register.
define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: smlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlal v2.4s, v0.4h, v1.h[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %B.lane1)
  %sum = add <4 x i32> %C, %prod
  ret <4 x i32> %sum
}
; Adds smull(%A, splat(%B[1])) to the accumulator %C; a by-element smlal on
; s[1] is expected, followed by a move of the accumulator into the return
; register.
define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: smlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlal v2.2d, v0.2s, v1.s[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %B.lane1)
  %sum = add <2 x i64> %C, %prod
  ret <2 x i64> %sum
}
; sqadd(%C, sqdmull(%A, splat(%B[1]))) on <4 x i32>; the pair is expected to
; fuse into one by-element sqdmlal on h[1].
define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlal v2.4s, v0.4h, v1.h[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %B.lane1)
  %sum = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %prod)
  ret <4 x i32> %sum
}
; sqadd(%C, sqdmull(%A, splat(%B[1]))) on <2 x i64>; the pair is expected to
; fuse into one by-element sqdmlal on s[1].
define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlal v2.2d, v0.2s, v1.s[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
  %B.lane1 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %B.lane1)
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %prod)
  ret <2 x i64> %sum
}
; sqadd(%C, sqdmull(high half of %A, splat(%B[1]))) on <4 x i32>. The
; SelectionDAG assertions expect a single by-element sqdmlal2; the GlobalISel
; assertions expect the high half to be extracted and then accumulated with
; a by-element sqdmlal.
define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-SD-LABEL: sqdmlal2_lane_4s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlal2 v2.4s, v0.8h, v1.h[1]
; CHECK-SD-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal2_lane_4s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: sqdmlal v0.4s, v3.4h, v1.h[1]
; CHECK-GI-NEXT: ret
  %A.hi = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %B.lane1 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A.hi, <4 x i16> %B.lane1)
  %sum = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %prod)
  ret <4 x i32> %sum
}
; sqadd(%C, sqdmull(high half of %A, splat(%B[1]))) on <2 x i64>. The
; SelectionDAG assertions expect a single by-element sqdmlal2; the GlobalISel
; assertions expect the high half to be extracted and then accumulated with
; a by-element sqdmlal.
define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-SD-LABEL: sqdmlal2_lane_2d:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlal2 v2.2d, v0.4s, v1.s[1]
; CHECK-SD-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal2_lane_2d:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: sqdmlal v0.2d, v3.2s, v1.s[1]
; CHECK-GI-NEXT: ret
  %A.hi = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %B.lane1 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A.hi, <2 x i32> %B.lane1)
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %prod)
  ret <2 x i64> %sum
}
; Scalar form: %B is placed in lane 0 of a vector, lane 1 of %C is shuffled
; into lane 0, the two are multiplied with the vector sqdmull, lane 0 of the
; product is extracted and saturating-added to %A. A single scalar by-element
; sqdmlal on h[1] is expected; the selectors differ only in register choice.
define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
; CHECK-SD-LABEL: sqdmlal_lane_1s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: sqdmlal s1, h2, v0.h[1]
; CHECK-SD-NEXT: fmov w0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal_lane_1s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s1, w1
; CHECK-GI-NEXT: fmov s2, w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: sqdmlal s2, h1, v0.h[1]
; CHECK-GI-NEXT: fmov w0, s2
; CHECK-GI-NEXT: ret
  %B.vec = insertelement <4 x i16> undef, i16 %B, i32 0
  %C.lane1 = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B.vec, <4 x i16> %C.lane1)
  %wide0 = extractelement <4 x i32> %wide, i32 0
  %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %wide0)
  ret i32 %sum
}
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
; Scalar (i32 accumulator) saturating doubling multiply-subtract by lane.
; Same IR shape as the sqdmlal_lane_1s test, but the extracted lane-0 product
; is saturating-subtracted (sqsub) from %A.
; Both selectors are expected to emit the scalar by-element form
; "sqdmlsl s, h, v0.h[1]"; they differ only in register assignment.
define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
; CHECK-SD-LABEL: sqdmlsl_lane_1s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: sqdmlsl s1, h2, v0.h[1]
; CHECK-SD-NEXT: fmov w0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl_lane_1s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s1, w1
; CHECK-GI-NEXT: fmov s2, w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: sqdmlsl s2, h1, v0.h[1]
; CHECK-GI-NEXT: fmov w0, s2
; CHECK-GI-NEXT: ret
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
%rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
%prod = extractelement <4 x i32> %prod.vec, i32 0
%res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
ret i32 %res
}
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
; Scalar sqadd of %A with lane 1 (not lane 0) of a full-vector sqdmull.
; The expected output keeps the operations separate: a vector sqdmull, a lane
; move to extract element 1, and a standalone scalar sqadd. No fused sqdmlal
; is expected for this shape.
define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: sqadd_lane1_sqdmull4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: mov s0, v0.s[1]
; CHECK-NEXT: sqadd s0, s1, s0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
%prod = extractelement <4 x i32> %prod.vec, i32 1
%res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
ret i32 %res
}
; Scalar sqsub of lane 1 (not lane 0) of a full-vector sqdmull from %A.
; The expected output keeps the operations separate: a vector sqdmull, a lane
; move to extract element 1, and a standalone scalar sqsub. No fused sqdmlsl
; is expected for this shape.
define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: sqsub_lane1_sqdmull4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: mov s0, v0.s[1]
; CHECK-NEXT: sqsub s0, s1, s0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
%prod = extractelement <4 x i32> %prod.vec, i32 1
%res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
ret i32 %res
}
; Scalar (i64 accumulator) saturating doubling multiply-accumulate by lane.
; Lane 1 of %C is extracted, multiplied with %B through the scalar
; sqdmulls intrinsic, and the i64 product is saturating-added to %A.
; The expected output is the scalar by-element form "sqdmlal d, s, v0.s[1]".
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_1d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov d2, x0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqdmlal d2, s1, v0.s[1]
; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
%rhs = extractelement <2 x i32> %C, i32 1
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
%res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
ret i64 %res
}
declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
; Scalar (i64 accumulator) saturating doubling multiply-subtract by lane.
; Lane 1 of %C is extracted, multiplied with %B through the scalar
; sqdmulls intrinsic, and the i64 product is saturating-subtracted from %A.
; The expected output is the scalar by-element form "sqdmlsl d, s, v0.s[1]".
define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_1d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov d2, x0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqdmlsl d2, s1, v0.s[1]
; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
%rhs = extractelement <2 x i32> %C, i32 1
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
%res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
ret i64 %res
}
declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
; Unsigned multiply-accumulate long by lane (16 -> 32 bit).
; %A is multiplied (umull) by a splat of lane 1 of %B and the product is added
; to %C with a plain IR add. The expected output is one by-element umlal.
define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: umlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlal v2.4s, v0.4h, v1.h[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = add <4 x i32> %C, %tmp5
ret <4 x i32> %tmp6
}
; Unsigned multiply-accumulate long by lane (32 -> 64 bit).
; %A is multiplied (umull) by a splat of lane 1 of %B and the product is added
; to %C with a plain IR add. The expected output is one by-element umlal.
define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: umlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlal v2.2d, v0.2s, v1.s[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = add <2 x i64> %C, %tmp5
ret <2 x i64> %tmp6
}
; Signed multiply-subtract long by lane (16 -> 32 bit).
; %A is multiplied (smull) by a splat of lane 1 of %B and the product is
; subtracted from %C with a plain IR sub. The expected output is one
; by-element smlsl.
define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: smlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlsl v2.4s, v0.4h, v1.h[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %C, %tmp5
ret <4 x i32> %tmp6
}
; Signed multiply-subtract long by lane (32 -> 64 bit).
; %A is multiplied (smull) by a splat of lane 1 of %B and the product is
; subtracted from %C with a plain IR sub. The expected output is one
; by-element smlsl.
define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: smlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlsl v2.2d, v0.2s, v1.s[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %C, %tmp5
ret <2 x i64> %tmp6
}
; Saturating doubling multiply-subtract long by lane (16 -> 32 bit).
; %A is multiplied (sqdmull) by a splat of lane 1 of %B and the product is
; saturating-subtracted (sqsub) from %C. The expected output is one
; by-element sqdmlsl.
define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlsl v2.4s, v0.4h, v1.h[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
; Saturating doubling multiply-subtract long by lane (32 -> 64 bit).
; %A is multiplied (sqdmull) by a splat of lane 1 of %B and the product is
; saturating-subtracted (sqsub) from %C. The expected output is one
; by-element sqdmlsl.
define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlsl v2.2d, v0.2s, v1.s[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
; Saturating doubling multiply-subtract long, high half, by lane (16 -> 32 bit).
; Lanes 4-7 of %A are multiplied (sqdmull) by a splat of lane 1 of %B and the
; product is saturating-subtracted (sqsub) from %C.
; SelectionDAG is expected to emit one by-element sqdmlsl2; GlobalISel is
; expected to extract the high half with a mov and use by-element sqdmlsl.
define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-SD-LABEL: sqdmlsl2_lane_4s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlsl2 v2.4s, v0.8h, v1.h[1]
; CHECK-SD-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl2_lane_4s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: sqdmlsl v0.4s, v3.4h, v1.h[1]
; CHECK-GI-NEXT: ret
%tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
; Saturating doubling multiply-subtract long, high half, by lane (32 -> 64 bit).
; Lanes 2-3 of %A are multiplied (sqdmull) by a splat of lane 1 of %B and the
; product is saturating-subtracted (sqsub) from %C.
; SelectionDAG is expected to emit one by-element sqdmlsl2; GlobalISel is
; expected to extract the high half with a mov and use by-element sqdmlsl.
define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-SD-LABEL: sqdmlsl2_lane_2d:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlsl2 v2.2d, v0.4s, v1.s[1]
; CHECK-SD-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl2_lane_2d:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: sqdmlsl v0.2d, v3.2s, v1.s[1]
; CHECK-GI-NEXT: ret
%tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
; Unsigned multiply-subtract long by lane (16 -> 32 bit).
; %A is multiplied (umull) by a splat of lane 1 of %B and the product is
; subtracted from %C with a plain IR sub. The expected output is one
; by-element umlsl.
define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: umlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlsl v2.4s, v0.4h, v1.h[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %C, %tmp5
ret <4 x i32> %tmp6
}
; Unsigned multiply-subtract long by lane (32 -> 64 bit).
; %A is multiplied (umull) by a splat of lane 1 of %B and the product is
; subtracted from %C with a plain IR sub. The expected output is one
; by-element umlsl.
define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: umlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlsl v2.2d, v0.2s, v1.s[1]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %C, %tmp5
ret <2 x i64> %tmp6
}
; Scalar FMULX
; Scalar single-precision fmulx intrinsic on two register operands.
; The expected output is a single "fmulx s0, s0, s1".
define float @fmulxs(float %a, float %b) nounwind {
; CHECK-LABEL: fmulxs:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx s0, s0, s1
; CHECK-NEXT: ret
%fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
ret float %fmulx.i
}
; Scalar double-precision fmulx intrinsic on two register operands.
; The expected output is a single "fmulx d0, d0, d1".
define double @fmulxd(double %a, double %b) nounwind {
; CHECK-LABEL: fmulxd:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx d0, d0, d1
; CHECK-NEXT: ret
%fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
ret double %fmulx.i
}
; Scalar single-precision fmulx where the second operand is lane 3 extracted
; from a <4 x float> vector. The extractelement is expected to fold into the
; by-element form "fmulx s0, s0, v1.s[3]".
define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmulxs_lane:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx s0, s0, v1.s[3]
; CHECK-NEXT: ret
%b = extractelement <4 x float> %vec, i32 3
%fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
ret float %fmulx.i
}
; Scalar double-precision fmulx where the second operand is lane 1 extracted
; from a <2 x double> vector. The extractelement is expected to fold into the
; by-element form "fmulx d0, d0, v1.d[1]".
define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmulxd_lane:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx d0, d0, v1.d[1]
; CHECK-NEXT: ret
%b = extractelement <2 x double> %vec, i32 1
%fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
ret double %fmulx.i
}
declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
; Signed widening multiply of the high halves (lanes 8-15) of two <16 x i8>
; vectors, with the halves selected directly by shufflevector.
; The expected output is a single smull2.
define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: smull2_8h_simple:
; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT: ret
%1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
ret <8 x i16> %3
}
; Signed widening multiply (8 -> 16 bit) of the high halves of %a and %b.
; Each high half is obtained by bitcasting to <2 x i64>, selecting element 1
; with a shufflevector, and bitcasting back to <8 x i8>.
; The expected output is a single smull2.
define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: foo0:
; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT: ret
%tmp = bitcast <16 x i8> %a to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
%tmp2 = bitcast <16 x i8> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
ret <8 x i16> %vmull.i.i
}
; Signed widening multiply (16 -> 32 bit) of the high halves of %a and %b.
; Each high half is obtained by bitcasting to <2 x i64>, selecting element 1
; with a shufflevector, and bitcasting back to <4 x i16>.
; The expected output is a single smull2.
define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: foo1:
; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT: ret
%tmp = bitcast <8 x i16> %a to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
%tmp2 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
ret <4 x i32> %vmull2.i.i
}
; Signed widening multiply (32 -> 64 bit) of the high halves of %a and %b.
; Each high half is obtained by bitcasting to <2 x i64>, selecting element 1
; with a shufflevector, and bitcasting back to <2 x i32>.
; The expected output is a single smull2.
define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: foo2:
; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT: ret
%tmp = bitcast <4 x i32> %a to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
%tmp2 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
ret <2 x i64> %vmull2.i.i
}
; Unsigned widening multiply (8 -> 16 bit) of the high halves of %a and %b.
; Each high half is obtained by bitcasting to <2 x i64>, selecting element 1
; with a shufflevector, and bitcasting back to <8 x i8>.
; The expected output is a single umull2.
define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: foo3:
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT: ret
%tmp = bitcast <16 x i8> %a to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
%tmp2 = bitcast <16 x i8> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
ret <8 x i16> %vmull.i.i
}
; Unsigned widening multiply (16 -> 32 bit) of the high halves of %a and %b.
; Each high half is obtained by bitcasting to <2 x i64>, selecting element 1
; with a shufflevector, and bitcasting back to <4 x i16>.
; The expected output is a single umull2.
define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: foo4:
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT: ret
%tmp = bitcast <8 x i16> %a to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
%tmp2 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
ret <4 x i32> %vmull2.i.i
}
; Unsigned widening multiply (32 -> 64 bit) of the high halves of %a and %b.
; Each high half is obtained by bitcasting to <2 x i64>, selecting element 1
; with a shufflevector, and bitcasting back to <2 x i32>.
; The expected output is a single umull2.
define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: foo5:
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT: ret
%tmp = bitcast <4 x i32> %a to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
%tmp2 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
ret <2 x i64> %vmull2.i.i
}
; Signed widening multiply (16 -> 32 bit) of the high half of %b by a splat of
; lane 1 of %c. The high half is taken through a <2 x i64> bitcast and an
; element-1 shuffle. %a is not used by the body.
; SelectionDAG is expected to emit one by-element smull2; GlobalISel is
; expected to extract the high half with a mov and use by-element smull.
define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: foo6:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: smull2 v0.4s, v1.8h, v2.h[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: foo6:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: smull v0.4s, v0.4h, v2.h[1]
; CHECK-GI-NEXT: ret
entry:
%0 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
; Low-half counterpart of foo6: element 0 of the <2 x i64> view of %b is
; selected, so the multiply uses the low four i16 lanes of %b with a splat of
; lane 1 of %c. %a is not used by the body.
; The expected output is the non-"2" by-element smull.
define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo6a:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: smull v0.4s, v1.4h, v2.h[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
; Signed widening multiply (32 -> 64 bit) of the high half of %b by a splat of
; lane 1 of %c. The high half is taken through a <2 x i64> bitcast and an
; element-1 shuffle. %a is not used by the body.
; SelectionDAG is expected to emit one by-element smull2; GlobalISel is
; expected to extract the high half with a mov and use by-element smull.
define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: foo7:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: smull2 v0.2d, v1.4s, v2.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: foo7:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: smull v0.2d, v0.2s, v2.s[1]
; CHECK-GI-NEXT: ret
entry:
%0 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
; Low-half counterpart of foo7: element 0 of the <2 x i64> view of %b is
; selected, so the multiply uses the low two i32 lanes of %b with a splat of
; lane 1 of %c. %a is not used by the body.
; The expected output is the non-"2" by-element smull.
define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo7a:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: smull v0.2d, v1.2s, v2.s[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
; Unsigned widening multiply (16 -> 32 bit) of the high half of %b by a splat
; of lane 1 of %c. The high half is taken through a <2 x i64> bitcast and an
; element-1 shuffle. %a is not used by the body.
; SelectionDAG is expected to emit one by-element umull2; GlobalISel is
; expected to extract the high half with a mov and use by-element umull.
define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: foo8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: umull2 v0.4s, v1.8h, v2.h[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: foo8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: umull v0.4s, v0.4h, v2.h[1]
; CHECK-GI-NEXT: ret
entry:
%0 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
; Low-half counterpart of foo8: element 0 of the <2 x i64> view of %b is
; selected, so the multiply uses the low four i16 lanes of %b with a splat of
; lane 1 of %c. %a is not used by the body.
; The expected output is the non-"2" by-element umull.
define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo8a:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: umull v0.4s, v1.4h, v2.h[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
; Unsigned widening multiply (32 -> 64 bit) of the high half of %b by a splat
; of lane 1 of %c. The high half is taken through a <2 x i64> bitcast and an
; element-1 shuffle. %a is not used by the body.
; SelectionDAG is expected to emit one by-element umull2; GlobalISel is
; expected to extract the high half with a mov and use by-element umull.
define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: foo9:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: umull2 v0.2d, v1.4s, v2.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: foo9:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: umull v0.2d, v0.2s, v2.s[1]
; CHECK-GI-NEXT: ret
entry:
%0 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
; Low-half counterpart of foo9: element 0 of the <2 x i64> view of %b is
; selected, so the multiply uses the low two i32 lanes of %b with a splat of
; lane 1 of %c. %a is not used by the body.
; The expected output is the non-"2" by-element umull.
define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9a:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: umull v0.2d, v1.2s, v2.s[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
; Signed widening multiply-accumulate (8 -> 16 bit) on high halves.
; The high halves of %b and %c (element 1 of their <2 x i64> views) are
; multiplied with smull and the product is added to %a.
; The expected output is a single smlal2.
define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar0:
; CHECK: // %bb.0:
; CHECK-NEXT: smlal2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT: ret
%tmp = bitcast <16 x i8> %b to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
%tmp2 = bitcast <16 x i8> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
%vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
%add.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i
}
; Signed widening multiply-accumulate (16 -> 32 bit) on high halves.
; The high halves of %b and %c (element 1 of their <2 x i64> views) are
; multiplied with smull and the product is added to %a.
; The expected output is a single smlal2.
define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
; CHECK-LABEL: bar1:
; CHECK: // %bb.0:
; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
%tmp = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
%tmp2 = bitcast <8 x i16> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
%vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i
}
; Signed widening multiply-accumulate (32 -> 64 bit) on high halves.
; The high halves of %b and %c (element 1 of their <2 x i64> views) are
; multiplied with smull and the product is added to %a.
; The expected output is a single smlal2.
define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
; CHECK-LABEL: bar2:
; CHECK: // %bb.0:
; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: ret
%tmp = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
%tmp2 = bitcast <4 x i32> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
%vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i
}
; Unsigned widening multiply-accumulate (8 -> 16 bit) on high halves.
; The high halves of %b and %c (element 1 of their <2 x i64> views) are
; multiplied with umull and the product is added to %a.
; The expected output is a single umlal2.
define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar3:
; CHECK: // %bb.0:
; CHECK-NEXT: umlal2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT: ret
%tmp = bitcast <16 x i8> %b to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
%tmp2 = bitcast <16 x i8> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
%vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
%add.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i
}
; Unsigned widening multiply-accumulate (16 -> 32 bit) on high halves.
; The high halves of %b and %c (element 1 of their <2 x i64> views) are
; multiplied with umull and the product is added to %a.
; The expected output is a single umlal2.
define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
; CHECK-LABEL: bar4:
; CHECK: // %bb.0:
; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
%tmp = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
%tmp2 = bitcast <8 x i16> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
%vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i
}
; Unsigned widening multiply-accumulate (32 -> 64 bit) on high halves.
; The high halves of %b and %c (element 1 of their <2 x i64> views) are
; multiplied with umull and the product is added to %a.
; The expected output is a single umlal2.
define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
; CHECK-LABEL: bar5:
; CHECK: // %bb.0:
; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: ret
%tmp = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
%tmp2 = bitcast <4 x i32> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
%vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i
}
; Signed multiply-accumulate long on the high half of %b by a lane of %c.
; Lane 3 of %c is first splatted to a full 8-lane vector; then the high halves
; of both %b and the splat are taken (via <2 x i64> element 1), multiplied with
; smull, and added to %a.
; SelectionDAG is expected to emit one by-element smlal2 using lane h[3].
; GlobalISel is expected to materialise the splat with dup and use the
; vector-by-vector smlal2.
define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
; CHECK-SD-LABEL: mlal2_1:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: smlal2 v0.4s, v1.8h, v2.h[3]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mlal2_1:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: dup v2.8h, v2.h[3]
; CHECK-GI-NEXT: smlal2 v0.4s, v1.8h, v2.8h
; CHECK-GI-NEXT: ret
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%tmp = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
%tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add
}
; Signed multiply-accumulate long on the high half of %b by a lane of %c.
; Lane 1 of %c is first splatted to a full 4-lane vector; then the high halves
; of both %b and the splat are taken (via <2 x i64> element 1), multiplied with
; smull, and added to %a.
; SelectionDAG is expected to emit one by-element smlal2 using lane s[1].
; GlobalISel is expected to materialise the splat with dup and use the
; vector-by-vector smlal2.
define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
; CHECK-SD-LABEL: mlal2_2:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: smlal2 v0.2d, v1.4s, v2.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mlal2_2:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: dup v2.4s, v2.s[1]
; CHECK-GI-NEXT: smlal2 v0.2d, v1.4s, v2.4s
; CHECK-GI-NEXT: ret
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
%tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add
}
; Unsigned multiply-accumulate long on the high half of %b by a lane of %c.
; Lane 2 of %c is first splatted to a full 8-lane vector; then the high halves
; of both %b and the splat are taken (via <2 x i64> element 1), multiplied with
; umull, and added to %a.
; SelectionDAG is expected to emit one by-element umlal2 using lane h[2].
; GlobalISel is expected to materialise the splat with dup and use the
; vector-by-vector umlal2.
define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
; CHECK-SD-LABEL: mlal2_4:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: umlal2 v0.4s, v1.8h, v2.h[2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mlal2_4:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: dup v2.8h, v2.h[2]
; CHECK-GI-NEXT: umlal2 v0.4s, v1.8h, v2.8h
; CHECK-GI-NEXT: ret
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
%tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add
}
; Unsigned multiply-accumulate long on the high half of %b by a lane of %c.
; Lane 0 of %c is first splatted to a full 4-lane vector (zeroinitializer
; shuffle mask); then the high halves of both %b and the splat are taken (via
; <2 x i64> element 1), multiplied with umull, and added to %a.
; SelectionDAG is expected to emit one by-element umlal2 using lane s[0].
; GlobalISel is expected to materialise the splat with dup and use the
; vector-by-vector umlal2.
define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
; CHECK-SD-LABEL: mlal2_5:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: umlal2 v0.2d, v1.4s, v2.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mlal2_5:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
; CHECK-GI-NEXT: umlal2 v0.2d, v1.4s, v2.4s
; CHECK-GI-NEXT: ret
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
%tmp = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
%tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add
}
; rdar://12328502
; Vector-by-scalar multiply for <2 x double>.
; The scalar %y is broadcast by an insertelement chain into both lanes and the
; result is fmul'ed with %x. The expected output avoids an explicit dup and
; uses the by-element form "fmul v0.2d, v0.2d, v1.d[0]".
define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
; CHECK-LABEL: vmulq_n_f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul v0.2d, v0.2d, v1.d[0]
; CHECK-NEXT: ret
entry:
%vecinit.i = insertelement <2 x double> undef, double %y, i32 0
%vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
%mul.i = fmul <2 x double> %vecinit1.i, %x
ret <2 x double> %mul.i
}
; Vector-by-scalar multiply for <4 x float>.
; The scalar %y is broadcast by an insertelement chain into all four lanes and
; the result is fmul'ed with %x. The expected output avoids an explicit dup
; and uses the by-element form "fmul v0.4s, v0.4s, v1.s[0]".
define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
; CHECK-LABEL: vmulq_n_f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.s[0]
; CHECK-NEXT: ret
entry:
%vecinit.i = insertelement <4 x float> undef, float %y, i32 0
%vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
%vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
%vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
%mul.i = fmul <4 x float> %vecinit3.i, %x
ret <4 x float> %mul.i
}
; Vector-by-scalar multiply for <2 x float>.
; The scalar %y is broadcast by an insertelement chain into both lanes and the
; result is fmul'ed with %x. The expected output avoids an explicit dup and
; uses the by-element form "fmul v0.2s, v0.2s, v1.s[0]".
define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
; CHECK-LABEL: vmul_n_f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0]
; CHECK-NEXT: ret
entry:
%vecinit.i = insertelement <2 x float> undef, float %y, i32 0
%vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
%mul.i = fmul <2 x float> %vecinit1.i, %x
ret <2 x float> %mul.i
}
; 64-bit multiply-accumulate by a lane taken from a 128-bit vector.
; Lane 6 of the <8 x i16> %c is splatted to four lanes, multiplied by %b, and
; added to %a. The expected output is one by-element mla indexing h[6].
define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
; CHECK-LABEL: vmla_laneq_s16_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[6]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
%mul = mul <4 x i16> %shuffle, %b
%add = add <4 x i16> %mul, %a
ret <4 x i16> %add
}
; Multiply-accumulate where the multiplier is lane 3 of the 128-bit vector %c
; broadcast to two i32 lanes; expected to select mla with a v2.s[3] operand.
define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
; CHECK-LABEL: vmla_laneq_s32_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[3]
; CHECK-NEXT: ret
entry:
  %c.lane3 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
  %scaled = mul <2 x i32> %c.lane3, %b
  %acc = add <2 x i32> %scaled, %a
  ret <2 x i32> %acc
}
; The splat source is built in two steps: take the upper four lanes of %c, then
; broadcast lane 1 of that half (i.e. lane 5 of %c) to eight lanes.
; SelectionDAG is expected to fold both shuffles into mla ... v2.h[5];
; the GlobalISel expectations keep an ext and index lane 1 of its result.
define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
; CHECK-SD-LABEL: not_really_vmlaq_laneq_s16_test:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mla v0.8h, v1.8h, v2.h[5]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: not_really_vmlaq_laneq_s16_test:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ext v2.16b, v2.16b, v0.16b, #8
; CHECK-GI-NEXT: mla v0.8h, v1.8h, v2.h[1]
; CHECK-GI-NEXT: ret
entry:
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi.lane1 = shufflevector <4 x i16> %c.hi, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %scaled = mul <8 x i16> %c.hi.lane1, %b
  %acc = add <8 x i16> %scaled, %a
  ret <8 x i16> %acc
}
; i32 counterpart of the two-step splat: take the upper two lanes of %c, then
; broadcast lane 1 of that half (i.e. lane 3 of %c) to four lanes.
; SelectionDAG is expected to produce mla ... v2.s[3]; the GlobalISel
; expectations keep an ext and index lane 1 of its result.
define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
; CHECK-SD-LABEL: not_really_vmlaq_laneq_s32_test:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mla v0.4s, v1.4s, v2.s[3]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: not_really_vmlaq_laneq_s32_test:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ext v2.16b, v2.16b, v0.16b, #8
; CHECK-GI-NEXT: mla v0.4s, v1.4s, v2.s[1]
; CHECK-GI-NEXT: ret
entry:
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %c.hi.lane1 = shufflevector <2 x i32> %c.hi, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %scaled = mul <4 x i32> %c.hi.lane1, %b
  %acc = add <4 x i32> %scaled, %a
  ret <4 x i32> %acc
}
; Signed widening multiply of %a by lane 6 of the 128-bit vector %b; the lane
; broadcast is expected to fold into smull's by-element form.
define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_s16_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[6]
; CHECK-NEXT: ret
entry:
  %b.lane6 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %widened = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b.lane6) #2
  ret <4 x i32> %widened
}
; Signed widening multiply of %a by lane 2 of the 128-bit vector %b; the lane
; broadcast is expected to fold into smull's by-element form.
define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_s32_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[2]
; CHECK-NEXT: ret
entry:
  %b.lane2 = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %widened = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b.lane2) #2
  ret <2 x i64> %widened
}
; Unsigned widening multiply of %a by lane 6 of the 128-bit vector %b; the lane
; broadcast is expected to fold into umull's by-element form.
define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_u16_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[6]
; CHECK-NEXT: ret
entry:
  %b.lane6 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %widened = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b.lane6) #2
  ret <4 x i32> %widened
}
; Unsigned widening multiply of %a by lane 2 of the 128-bit vector %b; the lane
; broadcast is expected to fold into umull's by-element form.
define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_u32_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[2]
; CHECK-NEXT: ret
entry:
  %b.lane2 = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %widened = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b.lane2) #2
  ret <2 x i64> %widened
}
; Signed widening multiply of the LOW half of %b (extracted via a <2 x i64>
; bitcast) by the truncated scalar %d splatted to four i16 lanes.
; Expected output: a 64-bit dup from w0 followed by a plain smull.
; %a and %c are not referenced by the body.
define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_low_n_s16_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup v0.4h, w0
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
; CHECK-NEXT: ret
entry:
  %d.i16 = trunc i32 %d to i16
  %b.as.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.lo.i64 = shufflevector <2 x i64> %b.as.i64, <2 x i64> undef, <1 x i32> <i32 0>
  %b.lo = bitcast <1 x i64> %b.lo.i64 to <4 x i16>
  %d.splat0 = insertelement <4 x i16> undef, i16 %d.i16, i32 0
  %d.splat1 = insertelement <4 x i16> %d.splat0, i16 %d.i16, i32 1
  %d.splat2 = insertelement <4 x i16> %d.splat1, i16 %d.i16, i32 2
  %d.splat = insertelement <4 x i16> %d.splat2, i16 %d.i16, i32 3
  %widened = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.lo, <4 x i16> %d.splat) nounwind
  ret <4 x i32> %widened
}
; Signed widening multiply of the HIGH half of %b by the truncated scalar %d
; splatted to four i16 lanes.
; SelectionDAG is expected to use a 128-bit dup plus smull2; the GlobalISel
; expectations extract the high half with mov and use a 64-bit dup plus smull.
; %a and %c are not referenced by the body.
define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: vmull_high_n_s16_test:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8h, w0
; CHECK-SD-NEXT: smull2 v0.4s, v1.8h, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vmull_high_n_s16_test:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: dup v1.4h, w0
; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: ret
entry:
  %d.i16 = trunc i32 %d to i16
  %b.as.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.as.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <4 x i16>
  %d.splat0 = insertelement <4 x i16> undef, i16 %d.i16, i32 0
  %d.splat1 = insertelement <4 x i16> %d.splat0, i16 %d.i16, i32 1
  %d.splat2 = insertelement <4 x i16> %d.splat1, i16 %d.i16, i32 2
  %d.splat = insertelement <4 x i16> %d.splat2, i16 %d.i16, i32 3
  %widened = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %d.splat) nounwind
  ret <4 x i32> %widened
}
; Signed widening multiply of the HIGH half of %b by the scalar %d splatted to
; two i32 lanes.
; SelectionDAG is expected to use a 128-bit dup plus smull2; the GlobalISel
; expectations extract the high half with mov and use a 64-bit dup plus smull.
; %a and %c are not referenced by the body.
define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: vmull_high_n_s32_test:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.4s, w0
; CHECK-SD-NEXT: smull2 v0.2d, v1.4s, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vmull_high_n_s32_test:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: dup v1.2s, w0
; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: ret
entry:
  %b.as.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.as.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <2 x i32>
  %d.splat0 = insertelement <2 x i32> undef, i32 %d, i32 0
  %d.splat = insertelement <2 x i32> %d.splat0, i32 %d, i32 1
  %widened = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %d.splat) nounwind
  ret <2 x i64> %widened
}
; Unsigned widening multiply of the HIGH half of %b by the truncated scalar %d
; splatted to four i16 lanes.
; SelectionDAG is expected to use a 128-bit dup plus umull2; the GlobalISel
; expectations extract the high half with mov and use a 64-bit dup plus umull.
; %a and %c are not referenced by the body.
define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: vmull_high_n_u16_test:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8h, w0
; CHECK-SD-NEXT: umull2 v0.4s, v1.8h, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vmull_high_n_u16_test:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: dup v1.4h, w0
; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: ret
entry:
  %d.i16 = trunc i32 %d to i16
  %b.as.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.as.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <4 x i16>
  %d.splat0 = insertelement <4 x i16> undef, i16 %d.i16, i32 0
  %d.splat1 = insertelement <4 x i16> %d.splat0, i16 %d.i16, i32 1
  %d.splat2 = insertelement <4 x i16> %d.splat1, i16 %d.i16, i32 2
  %d.splat = insertelement <4 x i16> %d.splat2, i16 %d.i16, i32 3
  %widened = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %d.splat) nounwind
  ret <4 x i32> %widened
}
; Unsigned widening multiply of the HIGH half of %b by the scalar %d splatted
; to two i32 lanes.
; SelectionDAG is expected to use a 128-bit dup plus umull2; the GlobalISel
; expectations extract the high half with mov and use a 64-bit dup plus umull.
; %a and %c are not referenced by the body.
define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-SD-LABEL: vmull_high_n_u32_test:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.4s, w0
; CHECK-SD-NEXT: umull2 v0.2d, v1.4s, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vmull_high_n_u32_test:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v1.d[1]
; CHECK-GI-NEXT: dup v1.2s, w0
; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: ret
entry:
  %b.as.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.as.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <2 x i32>
  %d.splat0 = insertelement <2 x i32> undef, i32 %d, i32 0
  %d.splat = insertelement <2 x i32> %d.splat0, i32 %d, i32 1
  %widened = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %d.splat) nounwind
  ret <2 x i64> %widened
}
; The multiplier is a splat rebuilt by hand: extract element 1 of %b and insert
; it into all four lanes. SelectionDAG is expected to recognise this as a
; by-element mul on v1.s[1]; the GlobalISel expectations materialise the splat
; with mov + dup before a full-vector mul.
define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: vmul_built_dup_test:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vmul_built_dup_test:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov s1, v1.s[1]
; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
  %b.elt1 = extractelement <4 x i32> %b, i32 1
  %dup0 = insertelement <4 x i32> undef, i32 %b.elt1, i32 0
  %dup1 = insertelement <4 x i32> %dup0, i32 %b.elt1, i32 1
  %dup2 = insertelement <4 x i32> %dup1, i32 %b.elt1, i32 2
  %dup = insertelement <4 x i32> %dup2, i32 %b.elt1, i32 3
  %product = mul <4 x i32> %a, %dup
  ret <4 x i32> %product
}
; Hand-built splat from a 64-bit source: element 3 of the <4 x i16> %b is
; inserted into every lane of a <4 x i16> multiplier. SelectionDAG is expected
; to use a by-element mul on v1.h[3]; the GlobalISel expectations materialise
; the splat with mov + dup first.
define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
; CHECK-SD-LABEL: vmul_built_dup_fromsmall_test:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[3]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vmul_built_dup_fromsmall_test:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov h1, v1.h[3]
; CHECK-GI-NEXT: dup v1.4h, v1.h[0]
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: ret
  %b.elt3 = extractelement <4 x i16> %b, i32 3
  %dup0 = insertelement <4 x i16> undef, i16 %b.elt3, i32 0
  %dup1 = insertelement <4 x i16> %dup0, i16 %b.elt3, i32 1
  %dup2 = insertelement <4 x i16> %dup1, i16 %b.elt3, i32 2
  %dup = insertelement <4 x i16> %dup2, i16 %b.elt3, i32 3
  %product = mul <4 x i16> %a, %dup
  ret <4 x i16> %product
}
; Hand-built 128-bit splat from a 64-bit source: element 0 of the <4 x i16> %b
; is inserted into all eight lanes of the multiplier. SelectionDAG is expected
; to use a by-element mul on v1.h[0]; the GlobalISel expectations emit a dup
; followed by a full-vector mul.
define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
; CHECK-SD-LABEL: vmulq_built_dup_fromsmall_test:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mul v0.8h, v0.8h, v1.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vmulq_built_dup_fromsmall_test:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
  %b.elt0 = extractelement <4 x i16> %b, i32 0
  %dup0 = insertelement <8 x i16> undef, i16 %b.elt0, i32 0
  %dup1 = insertelement <8 x i16> %dup0, i16 %b.elt0, i32 1
  %dup2 = insertelement <8 x i16> %dup1, i16 %b.elt0, i32 2
  %dup3 = insertelement <8 x i16> %dup2, i16 %b.elt0, i32 3
  %dup4 = insertelement <8 x i16> %dup3, i16 %b.elt0, i32 4
  %dup5 = insertelement <8 x i16> %dup4, i16 %b.elt0, i32 5
  %dup6 = insertelement <8 x i16> %dup5, i16 %b.elt0, i32 6
  %dup = insertelement <8 x i16> %dup6, i16 %b.elt0, i32 7
  %product = mul <8 x i16> %a, %dup
  ret <8 x i16> %product
}
; Both sqdmull operands are the high halves of 128-bit vectors; the two
; extracts are expected to fold into a single sqdmull2.
define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mull_from_two_extracts:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT: ret
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.hi) nounwind
  ret <2 x i64> %wide
}
; sqdmull of two high halves followed by sqadd into %accum; the whole sequence
; is expected to fold into a single sqdmlal2.
define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mlal_from_two_extracts:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: ret
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.hi) nounwind
  %acc = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %wide)
  ret <2 x i64> %acc
}
; sqdmull of the LOW half of %lhs (lanes 0-1) with the scalar %rhs splatted to
; two lanes; a 64-bit dup followed by the non-"2" sqdmull is expected.
define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_low:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v1.2s, w0
; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: ret
  %rhs.splat0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhs.splat = insertelement <2 x i32> %rhs.splat0, i32 %rhs, i32 1
  %lhs.lo = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.lo, <2 x i32> %rhs.splat) nounwind
  ret <2 x i64> %wide
}
; sqdmull of the HIGH half of %lhs (lanes 2-3) with the scalar %rhs splatted to
; two lanes. SelectionDAG is expected to widen the dup and use sqdmull2; the
; GlobalISel expectations extract the high half with mov and use sqdmull.
define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
; CHECK-SD-LABEL: mull_from_extract_dup_high:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: dup v1.4s, w0
; CHECK-SD-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mull_from_extract_dup_high:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v1.2s, w0
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: ret
  %rhs.splat0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhs.splat = insertelement <2 x i32> %rhs.splat0, i32 %rhs, i32 1
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.splat) nounwind
  ret <2 x i64> %wide
}
; pmull of the LOW eight bytes of %lhs with the scalar %rhs broadcast to eight
; lanes (insert into lane 0, then a zero-mask shuffle); a 64-bit dup followed
; by the non-"2" pmull is expected.
define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_low:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v1.8b, w0
; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ret
  %rhs.lane0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhs.splat = shufflevector <8 x i8> %rhs.lane0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %lhs.lo = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %wide = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.lo, <8 x i8> %rhs.splat) nounwind
  ret <8 x i16> %wide
}
; pmull of the HIGH eight bytes of %lhs with the scalar %rhs broadcast to eight
; lanes. SelectionDAG is expected to widen the dup and use pmull2; the
; GlobalISel expectations extract the high half with mov and use pmull.
define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
; CHECK-SD-LABEL: pmull_from_extract_dup_high:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: dup v1.16b, w0
; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: pmull_from_extract_dup_high:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v1.8b, w0
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT: ret
  %rhs.lane0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhs.splat = shufflevector <8 x i8> %rhs.lane0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %lhs.hi = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %wide = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.hi, <8 x i8> %rhs.splat) nounwind
  ret <8 x i16> %wide
}
; pmull of the LOW eight bytes of %lhs with lane 0 of the vector %rhs broadcast
; to eight lanes; a 64-bit lane dup followed by the non-"2" pmull is expected.
define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: dup v1.8b, v1.b[0]
; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ret
  %lhs.lo = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rhs.lane0 = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %wide = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.lo, <8 x i8> %rhs.lane0) nounwind
  ret <8 x i16> %wide
}
; pmull of the HIGH eight bytes of %lhs with lane 0 of the vector %rhs
; broadcast to eight lanes. SelectionDAG is expected to widen the lane dup and
; use pmull2; the GlobalISel expectations extract the high half with mov and
; use pmull.
define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-SD-LABEL: pmull_from_extract_duplane_high:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: dup v1.16b, v1.b[0]
; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: pmull_from_extract_duplane_high:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: dup v1.8b, v1.b[0]
; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT: ret
  %lhs.hi = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %rhs.lane0 = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %wide = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.hi, <8 x i8> %rhs.lane0) nounwind
  ret <8 x i16> %wide
}
; sqdmull of the LOW half of %lhs with lane 0 of %rhs broadcast to two lanes;
; expected to select the by-element sqdmull form directly.
define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[0]
; CHECK-NEXT: ret
  %lhs.lo = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.lane0 = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.lo, <2 x i32> %rhs.lane0) nounwind
  ret <2 x i64> %wide
}
; sqdmull of the HIGH half of %lhs with lane 0 of %rhs broadcast to two lanes.
; SelectionDAG is expected to select by-element sqdmull2; the GlobalISel
; expectations extract the high half with mov and use by-element sqdmull.
define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-SD-LABEL: sqdmull_from_extract_duplane_high:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmull_from_extract_duplane_high:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[0]
; CHECK-GI-NEXT: ret
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.lane0 = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.lane0) nounwind
  ret <2 x i64> %wide
}
; sqdmull (LOW half of %lhs times lane 0 of %rhs) followed by sqadd into
; %accum; expected to fold into a single by-element sqdmlal.
define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0]
; CHECK-NEXT: ret
  %lhs.lo = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.lane0 = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.lo, <2 x i32> %rhs.lane0) nounwind
  %acc = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %wide)
  ret <2 x i64> %acc
}
; sqdmull (HIGH half of %lhs times lane 0 of %rhs) followed by sqadd into
; %accum. SelectionDAG is expected to select by-element sqdmlal2; the
; GlobalISel expectations extract the high half with mov and use sqdmlal.
define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-SD-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlal2 v0.2d, v1.4s, v2.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0]
; CHECK-GI-NEXT: ret
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.lane0 = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.lane0) nounwind
  %acc = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %wide)
  ret <2 x i64> %acc
}
; umull (LOW half of %lhs times lane 0 of %rhs) followed by a plain vector add
; into %accum; expected to fold into a single by-element umlal.
define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[0]
; CHECK-NEXT: ret
  %lhs.lo = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.lane0 = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.lo, <2 x i32> %rhs.lane0) nounwind
  %acc = add <2 x i64> %accum, %wide
  ret <2 x i64> %acc
}
; umull (HIGH half of %lhs times lane 0 of %rhs) followed by a plain vector add
; into %accum. SelectionDAG is expected to select by-element umlal2; the
; GlobalISel expectations extract the high half with mov and use umlal.
define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-SD-LABEL: umlal_from_extract_duplane_high:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umlal2 v0.2d, v1.4s, v2.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umlal_from_extract_duplane_high:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[0]
; CHECK-GI-NEXT: ret
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.lane0 = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %wide = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.lane0) nounwind
  %acc = add <2 x i64> %accum, %wide
  ret <2 x i64> %acc
}
; Scalar fma whose multiplier is element 3 of a <4 x float>; the extract is
; expected to fold into the scalar by-element fmla form.
define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla s0, s1, v2.s[3]
; CHECK-NEXT: ret
  %rvec.elt3 = extractelement <4 x float> %rvec, i32 3
  %fused = call float @llvm.fma.f32(float %lhs, float %rvec.elt3, float %accum)
  ret float %fused
}
; Scalar fma whose multiplier is element 1 of a 64-bit <2 x float>.
; SelectionDAG is expected to use the by-element fmla form; the GlobalISel
; expectations extract the lane with mov and use fmadd.
define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-SD-LABEL: scalar_fmla_from_extract_v2f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: fmla s0, s1, v2.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: scalar_fmla_from_extract_v2f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov s2, v2.s[1]
; CHECK-GI-NEXT: fmadd s0, s1, s2, s0
; CHECK-GI-NEXT: ret
  %rvec.elt1 = extractelement <2 x float> %rvec, i32 1
  %fused = call float @llvm.fma.f32(float %lhs, float %rvec.elt1, float %accum)
  ret float %fused
}
; Scalar fma whose multiplier is the negation (written as -0.0 - x) of element
; 3 of a <4 x float>; expected to select the scalar by-element fmls form.
define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls s0, s1, v2.s[3]
; CHECK-NEXT: ret
  %rvec.elt3 = extractelement <4 x float> %rvec, i32 3
  %neg.elt = fsub float -0.0, %rvec.elt3
  %fused = call float @llvm.fma.f32(float %lhs, float %neg.elt, float %accum)
  ret float %fused
}
; Scalar fma whose multiplier is the negation (written as -0.0 - x) of element
; 1 of a 64-bit <2 x float>; expected to select the scalar by-element fmls form.
define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls s0, s1, v2.s[1]
; CHECK-NEXT: ret
  %rvec.elt1 = extractelement <2 x float> %rvec, i32 1
  %neg.elt = fsub float -0.0, %rvec.elt1
  %fused = call float @llvm.fma.f32(float %lhs, float %neg.elt, float %accum)
  ret float %fused
}
; Scalar float fused multiply-add intrinsic, called by the
; scalar_fmla/scalar_fmls_from_extract_v4f32 and _v2f32 tests above.
declare float @llvm.fma.f32(float, float, float)
; Scalar double fma whose multiplier is element 1 of a <2 x double>; the
; extract is expected to fold into the scalar by-element fmla form.
define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla d0, d1, v2.d[1]
; CHECK-NEXT: ret
  %rvec.elt1 = extractelement <2 x double> %rvec, i32 1
  %fused = call double @llvm.fma.f64(double %lhs, double %rvec.elt1, double %accum)
  ret double %fused
}
; Scalar double fma whose multiplier is the negation (written as -0.0 - x) of
; element 1 of a <2 x double>; expected to select the by-element fmls form.
define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls d0, d1, v2.d[1]
; CHECK-NEXT: ret
  %rvec.elt1 = extractelement <2 x double> %rvec, i32 1
  %neg.elt = fsub double -0.0, %rvec.elt1
  %fused = call double @llvm.fma.f64(double %lhs, double %neg.elt, double %accum)
  ret double %fused
}
; Scalar double fused multiply-add intrinsic, called by the
; scalar_fmla/scalar_fmls_from_extract_v2f64 tests above.
declare double @llvm.fma.f64(double, double, double)
; The whole <4 x float> %rhs is negated BEFORE lane 3 is broadcast to two lanes
; and fed to fma; the negation is still expected to fold into a by-element fmls.
define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls v0.2s, v1.2s, v2.s[3]
; CHECK-NEXT: ret
  %rhs.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %neg.lane3 = shufflevector <4 x float> %rhs.neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %neg.lane3, <2 x float> %accum)
  ret <2 x float> %fused
}
; Same pattern with a 64-bit source: the <2 x float> %rhs is negated, lane 1 is
; broadcast to two lanes and fed to fma; a by-element fmls is expected.
define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls v0.2s, v1.2s, v2.s[1]
; CHECK-NEXT: ret
  %rhs.neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %neg.lane1 = shufflevector <2 x float> %rhs.neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %neg.lane1, <2 x float> %accum)
  ret <2 x float> %fused
}
; 128-bit variant: the <4 x float> %rhs is negated, lane 3 is broadcast to four
; lanes and fed to fma; a by-element fmls is expected.
define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls v0.4s, v1.4s, v2.s[3]
; CHECK-NEXT: ret
  %rhs.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %neg.lane3 = shufflevector <4 x float> %rhs.neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %neg.lane3, <4 x float> %accum)
  ret <4 x float> %fused
}
; 128-bit result from a 64-bit source: the <2 x float> %rhs is negated, lane 1
; is broadcast to four lanes and fed to fma; a by-element fmls is expected.
define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls v0.4s, v1.4s, v2.s[1]
; CHECK-NEXT: ret
  %rhs.neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %neg.lane1 = shufflevector <2 x float> %rhs.neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %neg.lane1, <4 x float> %accum)
  ret <4 x float> %fused
}
; Double-precision variant: the <2 x double> %rhs is negated, lane 1 is
; broadcast to two lanes and fed to fma; a by-element fmls is expected.
define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls v0.2d, v1.2d, v2.d[1]
; CHECK-NEXT: ret
  %rhs.neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %neg.lane1 = shufflevector <2 x double> %rhs.neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %neg.lane1, <2 x double> %accum)
  ret <2 x double> %fused
}
; A <1 x double> multiply is expected to lower to the scalar fmul on d registers.
define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fmul_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
  %product = fmul <1 x double> %L, %R
  ret <1 x double> %product
}
; A <1 x double> divide is expected to lower to the scalar fdiv on d registers.
define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fdiv_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fdiv d0, d0, d1
; CHECK-NEXT: ret
  %quotient = fdiv <1 x double> %L, %R
  ret <1 x double> %quotient
}
; Scalar i16 x i16 -> i32 pattern spelled through vectors: %A and %B are placed
; in lane 0 of otherwise-undef <4 x i16> vectors, multiplied with the vector
; sqdmull intrinsic, and lane 0 of the result is sqadd-ed to %C.
; Both selectors are expected to emit one scalar sqdmlal; only the register
; assignment differs between them.
define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
; CHECK-SD-LABEL: sqdmlal_s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w2
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: sqdmlal s0, h1, v2.h[0]
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal_s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: fmov s1, w1
; CHECK-GI-NEXT: fmov s2, w2
; CHECK-GI-NEXT: sqdmlal s2, h0, v1.h[0]
; CHECK-GI-NEXT: fmov w0, s2
; CHECK-GI-NEXT: ret
  %lhs.vec = insertelement <4 x i16> undef, i16 %A, i64 0
  %rhs.vec = insertelement <4 x i16> undef, i16 %B, i64 0
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs.vec, <4 x i16> %rhs.vec)
  %dbl.prod.elt0 = extractelement <4 x i32> %dbl.prod, i64 0
  %acc = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %dbl.prod.elt0)
  ret i32 %acc
}
; Scalar i32 x i32 -> i64 pattern: the scalar sqdmulls intrinsic followed by a
; scalar sqadd with %C is expected to fold into one scalar sqdmlal.
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlal_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov d2, x2
; CHECK-NEXT: sqdmlal d2, s1, s0
; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
  %dbl.prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %acc = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %dbl.prod)
  ret i64 %acc
}
; Subtracting counterpart of sqdmlal_s: %A and %B are placed in lane 0 of
; otherwise-undef <4 x i16> vectors, multiplied with the vector sqdmull
; intrinsic, and lane 0 of the result is sqsub-ed from %C.
; Both selectors are expected to emit one scalar sqdmlsl; only the register
; assignment differs between them.
define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
; CHECK-SD-LABEL: sqdmlsl_s:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w2
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: sqdmlsl s0, h1, v2.h[0]
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl_s:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: fmov s1, w1
; CHECK-GI-NEXT: fmov s2, w2
; CHECK-GI-NEXT: sqdmlsl s2, h0, v1.h[0]
; CHECK-GI-NEXT: fmov w0, s2
; CHECK-GI-NEXT: ret
  %lhs.vec = insertelement <4 x i16> undef, i16 %A, i64 0
  %rhs.vec = insertelement <4 x i16> undef, i16 %B, i64 0
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs.vec, <4 x i16> %rhs.vec)
  %dbl.prod.elt0 = extractelement <4 x i32> %dbl.prod, i64 0
  %acc = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %dbl.prod.elt0)
  ret i32 %acc
}
; Scalar i32 x i32 -> i64 pattern: the scalar sqdmulls intrinsic followed by a
; scalar sqsub from %C is expected to fold into one scalar sqdmlsl.
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlsl_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov d2, x2
; CHECK-NEXT: sqdmlsl d2, s1, s0
; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
  %dbl.prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %acc = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %dbl.prod)
  ret i64 %acc
}
; pmull64 on two GPR inputs: each i64 is moved into a d register with fmov and
; a single 1d -> 1q pmull is expected. The two selectors differ only in
; register assignment and operand order.
define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-SD-LABEL: test_pmull_64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov d0, x1
; CHECK-SD-NEXT: fmov d1, x0
; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_pmull_64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: fmov d1, x1
; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
; CHECK-GI-NEXT: ret
  %poly.prod = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  ret <16 x i8> %poly.prod
}
; pmull64 on element 1 of each <2 x i64> input. SelectionDAG is expected to
; select pmull2 directly on the full vectors; the GlobalISel expectations move
; each high element out with mov and use pmull.
define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK-SD-LABEL: test_pmull_high_64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_pmull_high_64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
; CHECK-GI-NEXT: ret
  %l.hi = extractelement <2 x i64> %l, i32 1
  %r.hi = extractelement <2 x i64> %r, i32 1
  %poly.prod = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l.hi, i64 %r.hi)
  ret <16 x i8> %poly.prod
}
; Calls pmull64 twice with the operands swapped and adds the two results.
; SelectionDAG is expected to emit one pmull and add its result to itself;
; the GlobalISel expectations keep two separate pmull instructions.
define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-SD-LABEL: test_commutable_pmull_64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov d0, x1
; CHECK-SD-NEXT: fmov d1, x0
; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d
; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_commutable_pmull_64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: fmov d1, x1
; CHECK-GI-NEXT: pmull v2.1q, v0.1d, v1.1d
; CHECK-GI-NEXT: pmull v0.1q, v1.1d, v0.1d
; CHECK-GI-NEXT: add v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: ret
  %prod.lr = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  %prod.rl = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l)
  %total = add <16 x i8> %prod.lr, %prod.rl
  ret <16 x i8> %total
}
; AArch64 NEON pmull64 intrinsic (two i64 operands, <16 x i8> result), called
; by test_pmull_64, test_pmull_high_64 and test_commutable_pmull_64 above.
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
; A <1 x i64> multiply: both selectors are expected to move the operands to
; general-purpose registers, use the scalar integer mul, and move the result
; back to d0.
define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
; CHECK-SD-LABEL: test_mul_v1i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: fmov x8, d1
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: mul x8, x9, x8
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_mul_v1i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: ret
  %product = mul <1 x i64> %lhs, %rhs
  ret <1 x i64> %product
}
; sqdmull (<4 x i16> operands) whose <4 x i32> result is accumulated into %dst
; with the generic llvm.sadd.sat intrinsic. Both runs expect the pair to be
; selected as a single sqdmlal instruction.
define <4 x i32> @sqdmlal4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
; CHECK-LABEL: sqdmlal4s_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2)
%sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp)
ret <4 x i32> %sum
}
; sqdmull (<2 x i32> operands) whose <2 x i64> result is accumulated into %dst
; with the generic llvm.sadd.sat intrinsic. Both runs expect the pair to be
; selected as a single sqdmlal instruction.
define <2 x i64> @sqdmlal2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
; CHECK-LABEL: sqdmlal2d_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
%tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2)
%sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp)
ret <2 x i64> %sum
}
; High-half variant: elements 4..7 of both <8 x i16> inputs are extracted with
; shufflevector, multiplied with sqdmull and accumulated into %dst through
; llvm.sadd.sat. Both runs expect a single sqdmlal2 on the full 8h registers.
define <4 x i32> @sqdmlal2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
; CHECK-LABEL: sqdmlal2_4s_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
%tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
%sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
ret <4 x i32> %sum
}
; High-half variant: elements 2..3 of both <4 x i32> inputs are extracted with
; shufflevector, multiplied with sqdmull and accumulated into %dst through
; llvm.sadd.sat. Both runs expect a single sqdmlal2 on the full 4s registers.
define <2 x i64> @sqdmlal2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
; CHECK-LABEL: sqdmlal2_2d_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: ret
%tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
%sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
ret <2 x i64> %sum
}
; By-element variant: lane 3 of %v2 is splatted with shufflevector, multiplied
; against %v1 with sqdmull and accumulated into %dst through llvm.sadd.sat.
; Both runs expect the indexed form of sqdmlal reading v2.h[3], preceded by a
; kill annotation that redefines $d2 as $q2.
define <4 x i32> @sqdmlal_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
; CHECK-LABEL: sqdmlal_lane_4s_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.h[3]
; CHECK-NEXT: ret
%tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0)
%sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp1)
ret <4 x i32> %sum
}
; By-element variant: lane 1 of %v2 is splatted with shufflevector, multiplied
; against %v1 with sqdmull and accumulated into %dst through llvm.sadd.sat.
; Both runs expect the indexed form of sqdmlal reading v2.s[1], preceded by a
; kill annotation that redefines $d2 as $q2.
define <2 x i64> @sqdmlal_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
; CHECK-LABEL: sqdmlal_lane_2d_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[1]
; CHECK-NEXT: ret
%tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0)
%sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp1)
ret <2 x i64> %sum
}
; High-half, by-element variant: elements 4..7 of %v1 are multiplied (sqdmull)
; by a splat of lane 7 of %v2 and accumulated into %dst through llvm.sadd.sat.
; Without -global-isel a single indexed sqdmlal2 is expected. With
; -global-isel the expected output first moves the high half of v1 into d1 and
; then uses the indexed sqdmlal.
define <4 x i32> @sqdmlal2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
; CHECK-SD-LABEL: sqdmlal2_lane_4s_lib:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlal2 v0.4s, v1.8h, v2.h[7]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal2_lane_4s_lib:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: sqdmlal v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
%tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
%sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
ret <4 x i32> %sum
}
; High-half, by-element variant: elements 2..3 of %v1 are multiplied (sqdmull)
; by a splat of lane 1 of %v2 and accumulated into %dst through llvm.sadd.sat.
; Without -global-isel a single indexed sqdmlal2 is expected. With
; -global-isel the expected output first moves the high half of v1 into d1 and
; then uses the indexed sqdmlal.
define <2 x i64> @sqdmlal2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
; CHECK-SD-LABEL: sqdmlal2_lane_2d_lib:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlal2 v0.2d, v1.4s, v2.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlal2_lane_2d_lib:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[1]
; CHECK-GI-NEXT: ret
%tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
%sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
ret <2 x i64> %sum
}
; sqdmull (<4 x i16> operands) whose <4 x i32> result is subtracted from %dst
; with the generic llvm.ssub.sat intrinsic. Both runs expect the pair to be
; selected as a single sqdmlsl instruction.
define <4 x i32> @sqdmlsl4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
; CHECK-LABEL: sqdmlsl4s_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2)
%sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp)
ret <4 x i32> %sum
}
; sqdmull (<2 x i32> operands) whose <2 x i64> result is subtracted from %dst
; with the generic llvm.ssub.sat intrinsic. Both runs expect the pair to be
; selected as a single sqdmlsl instruction.
define <2 x i64> @sqdmlsl2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
; CHECK-LABEL: sqdmlsl2d_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT: ret
%tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2)
%sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp)
ret <2 x i64> %sum
}
; High-half variant: elements 4..7 of both <8 x i16> inputs are extracted with
; shufflevector, multiplied with sqdmull and subtracted from %dst through
; llvm.ssub.sat. Both runs expect a single sqdmlsl2 on the full 8h registers.
define <4 x i32> @sqdmlsl2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
; CHECK-LABEL: sqdmlsl2_4s_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
%tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
%sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
ret <4 x i32> %sum
}
; High-half variant: elements 2..3 of both <4 x i32> inputs are extracted with
; shufflevector, multiplied with sqdmull and subtracted from %dst through
; llvm.ssub.sat. Both runs expect a single sqdmlsl2 on the full 4s registers.
define <2 x i64> @sqdmlsl2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
; CHECK-LABEL: sqdmlsl2_2d_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: ret
%tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
%sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
ret <2 x i64> %sum
}
; By-element variant: lane 3 of %v2 is splatted with shufflevector, multiplied
; against %v1 with sqdmull and subtracted from %dst through llvm.ssub.sat.
; Both runs expect the indexed form of sqdmlsl reading v2.h[3], preceded by a
; kill annotation that redefines $d2 as $q2.
define <4 x i32> @sqdmlsl_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
; CHECK-LABEL: sqdmlsl_lane_4s_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[3]
; CHECK-NEXT: ret
%tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0)
%sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp1)
ret <4 x i32> %sum
}
; By-element variant: lane 1 of %v2 is splatted with shufflevector, multiplied
; against %v1 with sqdmull and subtracted from %dst through llvm.ssub.sat.
; Both runs expect the indexed form of sqdmlsl reading v2.s[1], preceded by a
; kill annotation that redefines $d2 as $q2.
define <2 x i64> @sqdmlsl_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
; CHECK-LABEL: sqdmlsl_lane_2d_lib:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[1]
; CHECK-NEXT: ret
%tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0)
%sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp1)
ret <2 x i64> %sum
}
; High-half, by-element variant: elements 4..7 of %v1 are multiplied (sqdmull)
; by a splat of lane 7 of %v2 and subtracted from %dst through llvm.ssub.sat.
; Without -global-isel a single indexed sqdmlsl2 is expected. With
; -global-isel the expected output first moves the high half of v1 into d1 and
; then uses the indexed sqdmlsl.
define <4 x i32> @sqdmlsl2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
; CHECK-SD-LABEL: sqdmlsl2_lane_4s_lib:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.h[7]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl2_lane_4s_lib:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
%tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
%sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
ret <4 x i32> %sum
}
; High-half, by-element variant: elements 2..3 of %v1 are multiplied (sqdmull)
; by a splat of lane 1 of %v2 and subtracted from %dst through llvm.ssub.sat.
; Without -global-isel a single indexed sqdmlsl2 is expected. With
; -global-isel the expected output first moves the high half of v1 into d1 and
; then uses the indexed sqdmlsl.
define <2 x i64> @sqdmlsl2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
; CHECK-SD-LABEL: sqdmlsl2_lane_2d_lib:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sqdmlsl2_lane_2d_lib:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[1]
; CHECK-GI-NEXT: ret
%tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
%sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
ret <2 x i64> %sum
}