; blob: eafa44a35d024b9ce19e729628aa6bbdc88feedd [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
; f16 multiply by a per-segment lane splat: element 2 of each 8-element half of
; %a (indices 2 and 10) is broadcast across its half and multiplied with %b;
; the product is stored to %c. The splat is the right-hand fmul operand.
; The assertions expect a single indexed fmul (z0.h[2]).
define void @fmul_indexed_f16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.h, z1.h, z0.h[2]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %va = load <16 x half>, ptr %a
  %vb = load <16 x half>, ptr %b
  ; Low half selects index 2, high half selects index 10 (= 8 + 2).
  %lane.splat = shufflevector <16 x half> %va, <16 x half> poison,
                  <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %prod = fmul <16 x half> %vb, %lane.splat
  store <16 x half> %prod, ptr %c
  ret void
}
; bf16 variant of the lane-splat multiply: element 2 of each 8-element half of
; %a (indices 2 and 10) is broadcast and multiplied with %b; result goes to %c.
; The recorded assertions show no indexed multiply here: each 128-bit half is
; handled with a NEON dup followed by a predicated bfmul.
define void @fmul_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: stp q0, q1, [x2]
; CHECK-NEXT: ret
  %va = load <16 x bfloat>, ptr %a
  %vb = load <16 x bfloat>, ptr %b
  ; Low half selects index 2, high half selects index 10 (= 8 + 2).
  %lane.splat = shufflevector <16 x bfloat> %va, <16 x bfloat> poison,
                  <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %prod = fmul <16 x bfloat> %vb, %lane.splat
  store <16 x bfloat> %prod, ptr %c
  ret void
}
; f32 multiply by a per-segment lane splat: element 3 of each 4-element half of
; %a (indices 3 and 7) is broadcast and multiplied with %b; result goes to %c.
; Unlike the f16 test, the splat is the left-hand fmul operand.
; The assertions expect a single indexed fmul (z0.s[3]).
define void @fmul_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.s, z1.s, z0.s[3]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %va = load <8 x float>, ptr %a
  %vb = load <8 x float>, ptr %b
  ; Low half selects index 3, high half selects index 7 (= 4 + 3).
  %lane.splat = shufflevector <8 x float> %va, <8 x float> poison,
                  <8 x i32> <i32 3, i32 3, i32 3, i32 3,
                             i32 7, i32 7, i32 7, i32 7>
  %prod = fmul <8 x float> %lane.splat, %vb
  store <8 x float> %prod, ptr %c
  ret void
}
; f64 multiply where the shuffle mask <0, 0, 2, 2> duplicates the first element
; of each 2-element half of %a; the splat (left-hand operand) is multiplied
; with %b and the product stored to %c.
; The assertions expect a single indexed fmul (z0.d[0]).
define void @fmul_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f64_256b_trn1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.d, z1.d, z0.d[0]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %va = load <4 x double>, ptr %a
  %vb = load <4 x double>, ptr %b
  %lane.splat = shufflevector <4 x double> %va, <4 x double> poison,
                  <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %prod = fmul <4 x double> %lane.splat, %vb
  store <4 x double> %prod, ptr %c
  ret void
}
; f64 multiply where the shuffle mask <1, 1, 3, 3> duplicates the second
; element of each 2-element half of %a; %b is multiplied by that splat
; (right-hand operand) and the product stored to %c.
; The assertions expect a single indexed fmul (z0.d[1]).
define void @fmul_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f64_256b_trn2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.d, z1.d, z0.d[1]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %va = load <4 x double>, ptr %a
  %vb = load <4 x double>, ptr %b
  %lane.splat = shufflevector <4 x double> %va, <4 x double> poison,
                  <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %prod = fmul <4 x double> %vb, %lane.splat
  store <4 x double> %prod, ptr %c
  ret void
}
; f16 multiply-add with a per-segment lane splat: computes
; llvm.fmuladd(%b, splat, %c), where the splat broadcasts element 2 of each
; 8-element half of %a (indices 2 and 10). %c is both the addend and the
; destination of the store.
; The assertions expect a single indexed fmla (z0.h[2]).
define void @fmla_indexed_f16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.h, z1.h, z0.h[2]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <16 x half>, ptr %a
  %vb = load <16 x half>, ptr %b
  %acc = load <16 x half>, ptr %c
  ; Low half selects index 2, high half selects index 10 (= 8 + 2).
  %lane.splat = shufflevector <16 x half> %va, <16 x half> poison,
                  <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %fma = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> %vb, <16 x half> %lane.splat, <16 x half> %acc)
  store <16 x half> %fma, ptr %c
  ret void
}
; bf16 variant of the lane-splat multiply-add: computes
; llvm.fmuladd(%b, splat, %c) with the splat taken from element 2 of each
; 8-element half of %a (indices 2 and 10); the result overwrites %c.
; The recorded assertions show no indexed or fused form here: each 128-bit
; half uses a NEON dup, then a predicated bfmul followed by bfadd.
define void @fmla_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: ldp q4, q5, [x2]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z4.h
; CHECK-NEXT: bfadd z1.h, p0/m, z1.h, z5.h
; CHECK-NEXT: stp q0, q1, [x2]
; CHECK-NEXT: ret
  %va = load <16 x bfloat>, ptr %a
  %vb = load <16 x bfloat>, ptr %b
  %acc = load <16 x bfloat>, ptr %c
  ; Low half selects index 2, high half selects index 10 (= 8 + 2).
  %lane.splat = shufflevector <16 x bfloat> %va, <16 x bfloat> poison,
                  <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %fma = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> %vb, <16 x bfloat> %lane.splat, <16 x bfloat> %acc)
  store <16 x bfloat> %fma, ptr %c
  ret void
}
; f32 multiply-add with a per-segment lane splat: computes
; llvm.fmuladd(splat, %b, %c), where the splat broadcasts element 3 of each
; 4-element half of %a (indices 3 and 7). Here the splat is the first
; multiplicand. The result overwrites %c.
; The assertions expect a single indexed fmla (z0.s[3]).
define void @fmla_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.s, z1.s, z0.s[3]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <8 x float>, ptr %a
  %vb = load <8 x float>, ptr %b
  %acc = load <8 x float>, ptr %c
  ; Low half selects index 3, high half selects index 7 (= 4 + 3).
  %lane.splat = shufflevector <8 x float> %va, <8 x float> poison,
                  <8 x i32> <i32 3, i32 3, i32 3, i32 3,
                             i32 7, i32 7, i32 7, i32 7>
  %fma = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %lane.splat, <8 x float> %vb, <8 x float> %acc)
  store <8 x float> %fma, ptr %c
  ret void
}
; f64 multiply-add where the shuffle mask <0, 0, 2, 2> duplicates the first
; element of each 2-element half of %a: computes llvm.fmuladd(splat, %b, %c)
; and stores the result back to %c.
; The assertions expect a single indexed fmla (z0.d[0]).
define void @fmla_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f64_256b_trn1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.d, z1.d, z0.d[0]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <4 x double>, ptr %a
  %vb = load <4 x double>, ptr %b
  %acc = load <4 x double>, ptr %c
  %lane.splat = shufflevector <4 x double> %va, <4 x double> poison,
                  <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %fma = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %lane.splat, <4 x double> %vb, <4 x double> %acc)
  store <4 x double> %fma, ptr %c
  ret void
}
; f64 multiply-add where the shuffle mask <1, 1, 3, 3> duplicates the second
; element of each 2-element half of %a: computes llvm.fmuladd(%b, splat, %c)
; and stores the result back to %c.
; The assertions expect a single indexed fmla (z0.d[1]).
define void @fmla_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f64_256b_trn2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.d, z1.d, z0.d[1]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <4 x double>, ptr %a
  %vb = load <4 x double>, ptr %b
  %acc = load <4 x double>, ptr %c
  %lane.splat = shufflevector <4 x double> %va, <4 x double> poison,
                  <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %fma = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vb, <4 x double> %lane.splat, <4 x double> %acc)
  store <4 x double> %fma, ptr %c
  ret void
}
; f16 multiply-subtract with a per-segment lane splat: computes
; llvm.fmuladd(-%b, splat, %c), where the splat broadcasts element 2 of each
; 8-element half of %a (indices 2 and 10). The negation is applied to the
; first multiplicand. The result overwrites %c.
; The assertions expect a single indexed fmls (z0.h[2]).
define void @fmls_indexed_f16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.h, z1.h, z0.h[2]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <16 x half>, ptr %a
  %vb = load <16 x half>, ptr %b
  %acc = load <16 x half>, ptr %c
  ; Low half selects index 2, high half selects index 10 (= 8 + 2).
  %lane.splat = shufflevector <16 x half> %va, <16 x half> poison,
                  <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %vb.neg = fneg <16 x half> %vb
  %fms = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> %vb.neg, <16 x half> %lane.splat, <16 x half> %acc)
  store <16 x half> %fms, ptr %c
  ret void
}
; bf16 variant of the lane-splat multiply-subtract: computes
; llvm.fmuladd(-%b, splat, %c) with the splat taken from element 2 of each
; 8-element half of %a (indices 2 and 10); the result overwrites %c.
; The recorded assertions show no indexed or fused form here: each 128-bit
; half uses a NEON dup, then a predicated bfmul followed by bfsub.
; NOTE(review): the IR result is %c - %b * splat, but the recorded bfsub takes
; the product (z0/z1) as its first source and the loaded %c (z4/z5) as its
; second. Confirm the operand order against fresh llc output when these
; assertions are next regenerated.
define void @fmls_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: ldp q4, q5, [x2]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z4.h
; CHECK-NEXT: bfsub z1.h, p0/m, z1.h, z5.h
; CHECK-NEXT: stp q0, q1, [x2]
; CHECK-NEXT: ret
  %va = load <16 x bfloat>, ptr %a
  %vb = load <16 x bfloat>, ptr %b
  %acc = load <16 x bfloat>, ptr %c
  ; Low half selects index 2, high half selects index 10 (= 8 + 2).
  %lane.splat = shufflevector <16 x bfloat> %va, <16 x bfloat> poison,
                  <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %vb.neg = fneg <16 x bfloat> %vb
  %fms = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> %vb.neg, <16 x bfloat> %lane.splat, <16 x bfloat> %acc)
  store <16 x bfloat> %fms, ptr %c
  ret void
}
; f32 multiply-subtract with a per-segment lane splat: computes
; llvm.fmuladd(splat, -%b, %c), where the splat broadcasts element 3 of each
; 4-element half of %a (indices 3 and 7). Here the negated value is the second
; multiplicand. The result overwrites %c.
; The assertions expect a single indexed fmls (z0.s[3]).
define void @fmls_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.s, z1.s, z0.s[3]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <8 x float>, ptr %a
  %vb = load <8 x float>, ptr %b
  %acc = load <8 x float>, ptr %c
  ; Low half selects index 3, high half selects index 7 (= 4 + 3).
  %lane.splat = shufflevector <8 x float> %va, <8 x float> poison,
                  <8 x i32> <i32 3, i32 3, i32 3, i32 3,
                             i32 7, i32 7, i32 7, i32 7>
  %vb.neg = fneg <8 x float> %vb
  %fms = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %lane.splat, <8 x float> %vb.neg, <8 x float> %acc)
  store <8 x float> %fms, ptr %c
  ret void
}
; f64 multiply-subtract where the shuffle mask <0, 0, 2, 2> duplicates the
; first element of each 2-element half of %a: computes
; llvm.fmuladd(splat, -%b, %c) and stores the result back to %c.
; The assertions expect a single indexed fmls (z0.d[0]).
define void @fmls_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f64_256b_trn1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.d, z1.d, z0.d[0]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <4 x double>, ptr %a
  %vb = load <4 x double>, ptr %b
  %acc = load <4 x double>, ptr %c
  %lane.splat = shufflevector <4 x double> %va, <4 x double> poison,
                  <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %vb.neg = fneg <4 x double> %vb
  %fms = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %lane.splat, <4 x double> %vb.neg, <4 x double> %acc)
  store <4 x double> %fms, ptr %c
  ret void
}
; f64 multiply-subtract where the shuffle mask <1, 1, 3, 3> duplicates the
; second element of each 2-element half of %a: computes
; llvm.fmuladd(-%b, splat, %c) and stores the result back to %c.
; The assertions expect a single indexed fmls (z0.d[1]).
define void @fmls_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f64_256b_trn2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.d, z1.d, z0.d[1]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %va = load <4 x double>, ptr %a
  %vb = load <4 x double>, ptr %b
  %acc = load <4 x double>, ptr %c
  %lane.splat = shufflevector <4 x double> %va, <4 x double> poison,
                  <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %vb.neg = fneg <4 x double> %vb
  %fms = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vb.neg, <4 x double> %lane.splat, <4 x double> %acc)
  store <4 x double> %fms, ptr %c
  ret void
}
; llvm.fmuladd overloads called by the fmla_* and fmls_* tests, one per
; element type / vector width used in this file.
declare <16 x half> @llvm.fmuladd.v16f16(<16 x half>, <16 x half>, <16 x half>)
declare <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>)
declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)

; Attribute group applied to every test function in this file: vscale is
; pinned to exactly 2 (the 256-bit configuration the _256b names refer to),
; with SVE2.1, BF16 and the SVE B16B16 extension enabled.
attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16,+sve-b16b16" }