[LLVM][CodeGen][SVE] Implement isel for maximumnum/minimumnum. (#185074)
Patch to add custom lowering for FCANONICALIZE, FMAXNUM_IEEE, and
FMINNUM_IEEE, all of which are required when relying on default
expansion of FMAXIMUMNUM and FMINIMUMNUM.
The lowering is very simple because AArch64's FMAXNM and FMINNM
instructions are IEEE754-2008 compliant, with the implementation
effectively following the same path taken for NEON.
NOTE: Bfloat support will be provided separately.

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fa8c19d..d1212f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1761,13 +1761,16 @@
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FCANONICALIZE, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
setOperationAction(ISD::FDIV, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMAXNUM, VT, Custom);
+ setOperationAction(ISD::FMAXNUM_IEEE, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMINNUM, VT, Custom);
+ setOperationAction(ISD::FMINNUM_IEEE, VT, Custom);
setOperationAction(ISD::FMUL, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
@@ -2457,6 +2460,7 @@
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Default);
setOperationAction(ISD::FABS, VT, Default);
setOperationAction(ISD::FADD, VT, Default);
+ setOperationAction(ISD::FCANONICALIZE, VT, Default);
setOperationAction(ISD::FCEIL, VT, Default);
setOperationAction(ISD::FCOPYSIGN, VT, Default);
setOperationAction(ISD::FDIV, VT, Default);
@@ -2464,8 +2468,10 @@
setOperationAction(ISD::FMA, VT, Default);
setOperationAction(ISD::FMAXIMUM, VT, Default);
setOperationAction(ISD::FMAXNUM, VT, Default);
+ setOperationAction(ISD::FMAXNUM_IEEE, VT, Default);
setOperationAction(ISD::FMINIMUM, VT, Default);
setOperationAction(ISD::FMINNUM, VT, Default);
+ setOperationAction(ISD::FMINNUM_IEEE, VT, Default);
setOperationAction(ISD::FMUL, VT, Default);
setOperationAction(ISD::FNEARBYINT, VT, Default);
setOperationAction(ISD::FNEG, VT, Default);
@@ -8393,10 +8399,12 @@
case ISD::FMAXIMUM:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
case ISD::FMAXNUM:
+ case ISD::FMAXNUM_IEEE:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
case ISD::FMINIMUM:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
case ISD::FMINNUM:
+ case ISD::FMINNUM_IEEE:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
case ISD::VSELECT:
return LowerFixedLengthVectorSelectToSVE(Op, DAG);
@@ -8488,6 +8496,8 @@
return LowerPARTIAL_REDUCE_MLA(Op, DAG);
case ISD::CLMUL:
return LowerCLMUL(Op, DAG);
+ case ISD::FCANONICALIZE:
+ return LowerFCANONICALIZE(Op, DAG);
}
}
@@ -33502,3 +33512,25 @@
EVT VT) const {
return Subtarget->hasCPA() && UseFEATCPACodegen;
}
+
+SDValue AArch64TargetLowering::LowerFCANONICALIZE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ assert(VT.isVector() && "Expected vector type!");
+
+ SDValue In = Op.getOperand(0);
+ SDValue Pg = getPredicateForVector(DAG, DL, VT);
+
+ // FMINNM follows IEEE754-2008, so taking the minimum of In with itself
+ // (governed by predicate Pg) yields the canonical form of In.
+ if (VT.isScalableVector())
+ return DAG.getNode(AArch64ISD::FMINNM_PRED, DL, VT, Pg, In, In);
+
+ assert(useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) &&
+ "Expected to lower to SVE!");
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ In = convertToScalableVector(DAG, ContainerVT, In);
+ In = DAG.getNode(AArch64ISD::FMINNM_PRED, DL, ContainerVT, Pg, In, In);
+ return convertFromScalableVector(DAG, VT, In);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d8b4d98..49ff76b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -786,7 +786,7 @@
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerFCANONICALIZE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2900cf0..8456694 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6138,6 +6138,8 @@
(v2f64 (FMAXNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>;
def : Pat<(v2f64 (fcanonicalize (v2f64 V128:$Rn))),
(v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rn)))>;
+def : Pat<(v1f64 (fcanonicalize (v1f64 V64:$Rn))),
+ (v1f64 (FMINNMDrr (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rn)))>;
def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
(v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>;
def : Pat<(v4f32 (fmaxnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
diff --git a/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll
index 753e2b7..3199d5f 100644
--- a/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll
+++ b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll
@@ -534,6 +534,54 @@
ret double %z
}
+define <1 x double> @fcanonicalize_v1f64(<1 x double> %x) {
+; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v1f64:
+; CHECK-NOFP16-NONEON: // %bb.0:
+; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0
+; CHECK-NOFP16-NONEON-NEXT: ret
+;
+; CHECK-FP16-NONEON-LABEL: fcanonicalize_v1f64:
+; CHECK-FP16-NONEON: // %bb.0:
+; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0
+; CHECK-FP16-NONEON-NEXT: ret
+;
+; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v1f64:
+; CHECK-NOFP16-NEON: // %bb.0:
+; CHECK-NOFP16-NEON-NEXT: fminnm d0, d0, d0
+; CHECK-NOFP16-NEON-NEXT: ret
+;
+; CHECK-FP16-NEON-LABEL: fcanonicalize_v1f64:
+; CHECK-FP16-NEON: // %bb.0:
+; CHECK-FP16-NEON-NEXT: fminnm d0, d0, d0
+; CHECK-FP16-NEON-NEXT: ret
+ %z = call <1 x double> @llvm.canonicalize.v1f64(<1 x double> %x)
+ ret <1 x double> %z
+}
+
+define <1 x double> @fcanonicalize_v1f64_nnan(<1 x double> %x) {
+; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-NOFP16-NONEON: // %bb.0:
+; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0
+; CHECK-NOFP16-NONEON-NEXT: ret
+;
+; CHECK-FP16-NONEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-FP16-NONEON: // %bb.0:
+; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0
+; CHECK-FP16-NONEON-NEXT: ret
+;
+; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-NOFP16-NEON: // %bb.0:
+; CHECK-NOFP16-NEON-NEXT: fminnm d0, d0, d0
+; CHECK-NOFP16-NEON-NEXT: ret
+;
+; CHECK-FP16-NEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-FP16-NEON: // %bb.0:
+; CHECK-FP16-NEON-NEXT: fminnm d0, d0, d0
+; CHECK-FP16-NEON-NEXT: ret
+ %z = call nnan <1 x double> @llvm.canonicalize.v1f64(<1 x double> %x)
+ ret <1 x double> %z
+}
+
define <2 x double> @fcanonicalize_v2f64(<2 x double> %x) {
; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f64:
; CHECK-NOFP16-NONEON: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
index 90a0499..1ed569c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
@@ -1197,80 +1197,696 @@
ret void
}
+;
+; FMAXIMUMNUM
+;
+
+; Don't use SVE for 64-bit vectors.
+define <4 x half> @fmaximumnum_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.4h, v1.4h, v1.4h
+; CHECK-NEXT: fminnm v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: fmaxnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
+ %res = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+ ret <4 x half> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <8 x half> @fmaximumnum_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.8h, v1.8h, v1.8h
+; CHECK-NEXT: fminnm v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+ ret <8 x half> %res
+}
+
+define void @fmaximumnum_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <16 x half>, ptr %a
+ %op2 = load <16 x half>, ptr %b
+ %res = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+ store <16 x half> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v32f16(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fmaximumnum_v32f16:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_EQ_256-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_EQ_256-NEXT: fminnm z2.h, p0/m, z2.h, z2.h
+; VBITS_EQ_256-NEXT: fminnm z3.h, p0/m, z3.h, z3.h
+; VBITS_EQ_256-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_EQ_256-NEXT: fmaxnm z2.h, p0/m, z2.h, z3.h
+; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaximumnum_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_GE_512-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_512-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <32 x half>, ptr %a
+ %op2 = load <32 x half>, ptr %b
+ %res = call <32 x half> @llvm.maximumnum.v32f16(<32 x half> %op1, <32 x half> %op2)
+ store <32 x half> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fmaximumnum_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <64 x half>, ptr %a
+ %op2 = load <64 x half>, ptr %b
+ %res = call <64 x half> @llvm.maximumnum.v64f16(<64 x half> %op1, <64 x half> %op2)
+ store <64 x half> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fmaximumnum_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <128 x half>, ptr %a
+ %op2 = load <128 x half>, ptr %b
+ %res = call <128 x half> @llvm.maximumnum.v128f16(<128 x half> %op1, <128 x half> %op2)
+ store <128 x half> %res, ptr %a
+ ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @fmaximumnum_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.2s, v1.2s, v1.2s
+; CHECK-NEXT: fminnm v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmaxnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
+ %res = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+ ret <2 x float> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @fmaximumnum_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: fminnm v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %res = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+ ret <4 x float> %res
+}
+
+define void @fmaximumnum_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <8 x float>, ptr %a
+ %op2 = load <8 x float>, ptr %b
+ %res = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+ store <8 x float> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v16f32(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fmaximumnum_v16f32:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_EQ_256-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_EQ_256-NEXT: fminnm z2.s, p0/m, z2.s, z2.s
+; VBITS_EQ_256-NEXT: fminnm z3.s, p0/m, z3.s, z3.s
+; VBITS_EQ_256-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_256-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaximumnum_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_GE_512-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_512-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <16 x float>, ptr %a
+ %op2 = load <16 x float>, ptr %b
+ %res = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> %op1, <16 x float> %op2)
+ store <16 x float> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fmaximumnum_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <32 x float>, ptr %a
+ %op2 = load <32 x float>, ptr %b
+ %res = call <32 x float> @llvm.maximumnum.v32f32(<32 x float> %op1, <32 x float> %op2)
+ store <32 x float> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fmaximumnum_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <64 x float>, ptr %a
+ %op2 = load <64 x float>, ptr %b
+ %res = call <64 x float> @llvm.maximumnum.v64f32(<64 x float> %op1, <64 x float> %op2)
+ store <64 x float> %res, ptr %a
+ ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <1 x double> @fmaximumnum_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d1, d1, d1
+; CHECK-NEXT: fminnm d0, d0, d0
+; CHECK-NEXT: fmaxnm d0, d0, d1
+; CHECK-NEXT: ret
+ %res = call <1 x double> @llvm.maximumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+ ret <1 x double> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <2 x double> @fmaximumnum_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.2d, v1.2d, v1.2d
+; CHECK-NEXT: fminnm v0.2d, v0.2d, v0.2d
+; CHECK-NEXT: fmaxnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
+ %res = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+ ret <2 x double> %res
+}
+
+define void @fmaximumnum_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %op2 = load <4 x double>, ptr %b
+ %res = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> %op1, <4 x double> %op2)
+ store <4 x double> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v8f64(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fmaximumnum_v8f64:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4
+; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_EQ_256-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_EQ_256-NEXT: fminnm z2.d, p0/m, z2.d, z2.d
+; VBITS_EQ_256-NEXT: fminnm z3.d, p0/m, z3.d, z3.d
+; VBITS_EQ_256-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_256-NEXT: fmaxnm z2.d, p0/m, z2.d, z3.d
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaximumnum_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_GE_512-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_512-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <8 x double>, ptr %a
+ %op2 = load <8 x double>, ptr %b
+ %res = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> %op1, <8 x double> %op2)
+ store <8 x double> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fmaximumnum_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <16 x double>, ptr %a
+ %op2 = load <16 x double>, ptr %b
+ %res = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> %op1, <16 x double> %op2)
+ store <16 x double> %res, ptr %a
+ ret void
+}
+
+define void @fmaximumnum_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fmaximumnum_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <32 x double>, ptr %a
+ %op2 = load <32 x double>, ptr %b
+ %res = call <32 x double> @llvm.maximumnum.v32f64(<32 x double> %op1, <32 x double> %op2)
+ store <32 x double> %res, ptr %a
+ ret void
+}
+
+;
+; FMINIMUMNUM
+;
+
+; Don't use SVE for 64-bit vectors.
+define <4 x half> @fminimumnum_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.4h, v1.4h, v1.4h
+; CHECK-NEXT: fminnm v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: fminnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
+ %res = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+ ret <4 x half> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <8 x half> @fminimumnum_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.8h, v1.8h, v1.8h
+; CHECK-NEXT: fminnm v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: fminnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+ ret <8 x half> %res
+}
+
+define void @fminimumnum_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <16 x half>, ptr %a
+ %op2 = load <16 x half>, ptr %b
+ %res = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+ store <16 x half> %res, ptr %a
+ ret void
+}
+
+define void @fminimumnum_v32f16(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fminimumnum_v32f16:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_EQ_256-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_EQ_256-NEXT: fminnm z2.h, p0/m, z2.h, z2.h
+; VBITS_EQ_256-NEXT: fminnm z3.h, p0/m, z3.h, z3.h
+; VBITS_EQ_256-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_EQ_256-NEXT: fminnm z2.h, p0/m, z2.h, z3.h
+; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminimumnum_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_GE_512-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_512-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <32 x half>, ptr %a
+ %op2 = load <32 x half>, ptr %b
+ %res = call <32 x half> @llvm.minimumnum.v32f16(<32 x half> %op1, <32 x half> %op2)
+ store <32 x half> %res, ptr %a
+ ret void
+}
+
+define void @fminimumnum_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fminimumnum_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <64 x half>, ptr %a
+ %op2 = load <64 x half>, ptr %b
+ %res = call <64 x half> @llvm.minimumnum.v64f16(<64 x half> %op1, <64 x half> %op2)
+ store <64 x half> %res, ptr %a
+ ret void
+}
+
+define void @fminimumnum_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fminimumnum_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <128 x half>, ptr %a
+ %op2 = load <128 x half>, ptr %b
+ %res = call <128 x half> @llvm.minimumnum.v128f16(<128 x half> %op1, <128 x half> %op2)
+ store <128 x half> %res, ptr %a
+ ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @fminimumnum_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.2s, v1.2s, v1.2s
+; CHECK-NEXT: fminnm v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fminnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
+ %res = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+ ret <2 x float> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @fminimumnum_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: fminnm v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %res = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+ ret <4 x float> %res
+}
+
+define void @fminimumnum_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <8 x float>, ptr %a
+ %op2 = load <8 x float>, ptr %b
+ %res = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+ store <8 x float> %res, ptr %a
+ ret void
+}
+
+define void @fminimumnum_v16f32(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fminimumnum_v16f32:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_EQ_256-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_EQ_256-NEXT: fminnm z2.s, p0/m, z2.s, z2.s
+; VBITS_EQ_256-NEXT: fminnm z3.s, p0/m, z3.s, z3.s
+; VBITS_EQ_256-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_256-NEXT: fminnm z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminimumnum_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_GE_512-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_512-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <16 x float>, ptr %a
+ %op2 = load <16 x float>, ptr %b
+ %res = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> %op1, <16 x float> %op2)
+ store <16 x float> %res, ptr %a
+ ret void
+}
+
+define void @fminimumnum_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fminimumnum_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <32 x float>, ptr %a
+ %op2 = load <32 x float>, ptr %b
+ %res = call <32 x float> @llvm.minimumnum.v32f32(<32 x float> %op1, <32 x float> %op2)
+ store <32 x float> %res, ptr %a
+ ret void
+}
+
+define void @fminimumnum_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fminimumnum_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <64 x float>, ptr %a
+ %op2 = load <64 x float>, ptr %b
+ %res = call <64 x float> @llvm.minimumnum.v64f32(<64 x float> %op1, <64 x float> %op2)
+ store <64 x float> %res, ptr %a
+ ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <1 x double> @fminimumnum_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d1, d1, d1
+; CHECK-NEXT: fminnm d0, d0, d0
+; CHECK-NEXT: fminnm d0, d0, d1
+; CHECK-NEXT: ret
+ %res = call <1 x double> @llvm.minimumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+ ret <1 x double> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <2 x double> @fminimumnum_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v1.2d, v1.2d, v1.2d
+; CHECK-NEXT: fminnm v0.2d, v0.2d, v0.2d
+; CHECK-NEXT: fminnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
+ %res = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+ ret <2 x double> %res
+}
+
+define void @fminimumnum_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %op2 = load <4 x double>, ptr %b
+ %res = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> %op1, <4 x double> %op2)
+ store <4 x double> %res, ptr %a
+ ret void
+}
+
+; llvm.minimumnum on <8 x double> with no vscale_range attribute, so the
+; expected code differs per check prefix:
+;  - VBITS_EQ_256: the vector is processed as two vl4 halves; the upper half
+;    is addressed through x8 (= 4 elements, scaled by lsl #3).
+;  - VBITS_GE_512: a single vl8-predicated sequence covers the whole vector.
+; In both cases each operand is first passed through fminnm with itself and
+; the two results are then combined by a final fminnm.
+; NOTE(review): the prefixes are defined by RUN lines outside this hunk.
+define void @fminimumnum_v8f64(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fminimumnum_v8f64:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4
+; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_EQ_256-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_EQ_256-NEXT: fminnm z2.d, p0/m, z2.d, z2.d
+; VBITS_EQ_256-NEXT: fminnm z3.d, p0/m, z3.d, z3.d
+; VBITS_EQ_256-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_256-NEXT: fminnm z2.d, p0/m, z2.d, z3.d
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminimumnum_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_GE_512-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_512-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <8 x double>, ptr %a
+ %op2 = load <8 x double>, ptr %b
+ %res = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> %op1, <8 x double> %op2)
+ store <8 x double> %res, ptr %a
+ ret void
+}
+
+; llvm.minimumnum on <16 x double>. vscale_range(8,0) sets the minimum vscale
+; to 8 and the checks expect the whole vector to be handled by one
+; vl16-predicated sequence: fminnm of each operand with itself, then a final
+; fminnm combining the two, stored back to %a.
+define void @fminimumnum_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fminimumnum_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <16 x double>, ptr %a
+ %op2 = load <16 x double>, ptr %b
+ %res = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> %op1, <16 x double> %op2)
+ store <16 x double> %res, ptr %a
+ ret void
+}
+
+; llvm.minimumnum on <32 x double>. vscale_range(16,0) sets the minimum vscale
+; to 16 and the checks expect the whole vector to be handled by one
+; vl32-predicated sequence: fminnm of each operand with itself, then a final
+; fminnm combining the two, stored back to %a.
+define void @fminimumnum_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fminimumnum_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <32 x double>, ptr %a
+ %op2 = load <32 x double>, ptr %b
+ %res = call <32 x double> @llvm.minimumnum.v32f64(<32 x double> %op1, <32 x double> %op2)
+ store <32 x double> %res, ptr %a
+ ret void
+}
+
attributes #0 = { "target-features"="+sve" }
-
-declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.minnum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.minnum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.minnum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.minnum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.minnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.minnum.v32f64(<32 x double>, <32 x double>)
-
-declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.maxnum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.maxnum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.maxnum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.maxnum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.maxnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.maxnum.v32f64(<32 x double>, <32 x double>)
-
-declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.minimum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.minimum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.minimum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.minimum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.minimum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.minimum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.minimum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.minimum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.minimum.v32f64(<32 x double>, <32 x double>)
-
-declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.maximum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.maximum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.maximum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.maximum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.maximum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.maximum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.maximum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.maximum.v32f64(<32 x double>, <32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
index 9ac6fc9..6fedf63 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
@@ -1881,137 +1881,272 @@
ret void
}
+;
+; FCANONICALIZE -> FMINNM
+;
+
+; Don't use SVE for 64-bit vectors.
+; llvm.canonicalize on <4 x half> is expected to become a single NEON fminnm
+; of the operand with itself (v0.4h), with no predicate or z-register use.
+define <4 x half> @fcanonicalize_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: ret
+ %res = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %op)
+ ret <4 x half> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+; llvm.canonicalize on <8 x half> is expected to become a single NEON fminnm
+; of the operand with itself (v0.8h), with no predicate or z-register use.
+define <8 x half> @fcanonicalize_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %op)
+ ret <8 x half> %res
+}
+
+; Canonicalises a <16 x half> vector in place at %a. The expected SVE code
+; loads it under a vl16 predicate, applies a predicated fminnm of the value
+; with itself and stores the result back.
+define void @fcanonicalize_v16f16(ptr %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <16 x half>, ptr %a
+ %res = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %op)
+ store <16 x half> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <32 x half> vector in place at %a, with no vscale_range
+; attribute, so the expected code differs per check prefix:
+;  - VBITS_GE_256: two vl16 halves, the upper one addressed through x8
+;    (= 16 elements, scaled by lsl #1), each canonicalised by its own fminnm.
+;  - VBITS_GE_512: a single vl32-predicated load / fminnm / store.
+; NOTE(review): the prefixes are defined by RUN lines outside this hunk.
+define void @fcanonicalize_v32f16(ptr %a) #0 {
+; VBITS_GE_256-LABEL: fcanonicalize_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_256-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcanonicalize_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op = load <32 x half>, ptr %a
+ %res = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %op)
+ store <32 x half> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <64 x half> vector in place at %a. vscale_range(8,0) sets
+; the minimum vscale to 8 and the checks expect a single vl64-predicated
+; load / fminnm-with-itself / store sequence.
+define void @fcanonicalize_v64f16(ptr %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcanonicalize_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <64 x half>, ptr %a
+ %res = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %op)
+ store <64 x half> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <128 x half> vector in place at %a. vscale_range(16,0) sets
+; the minimum vscale to 16 and the checks expect a single vl128-predicated
+; load / fminnm-with-itself / store sequence.
+define void @fcanonicalize_v128f16(ptr %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcanonicalize_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <128 x half>, ptr %a
+ %res = call <128 x half> @llvm.canonicalize.v128f16(<128 x half> %op)
+ store <128 x half> %res, ptr %a
+ ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+; llvm.canonicalize on <2 x float> is expected to become a single NEON fminnm
+; of the operand with itself (v0.2s), with no predicate or z-register use.
+define <2 x float> @fcanonicalize_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: ret
+ %res = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %op)
+ ret <2 x float> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+; llvm.canonicalize on <4 x float> is expected to become a single NEON fminnm
+; of the operand with itself (v0.4s), with no predicate or z-register use.
+define <4 x float> @fcanonicalize_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: ret
+ %res = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %op)
+ ret <4 x float> %res
+}
+
+; Canonicalises a <8 x float> vector in place at %a. The expected SVE code
+; loads it under a vl8 predicate, applies a predicated fminnm of the value
+; with itself and stores the result back.
+define void @fcanonicalize_v8f32(ptr %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <8 x float>, ptr %a
+ %res = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %op)
+ store <8 x float> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <16 x float> vector in place at %a, with no vscale_range
+; attribute, so the expected code differs per check prefix:
+;  - VBITS_GE_256: two vl8 halves, the upper one addressed through x8
+;    (= 8 elements, scaled by lsl #2), each canonicalised by its own fminnm.
+;  - VBITS_GE_512: a single vl16-predicated load / fminnm / store.
+; NOTE(review): the prefixes are defined by RUN lines outside this hunk.
+define void @fcanonicalize_v16f32(ptr %a) #0 {
+; VBITS_GE_256-LABEL: fcanonicalize_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_256-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcanonicalize_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op = load <16 x float>, ptr %a
+ %res = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> %op)
+ store <16 x float> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <32 x float> vector in place at %a. vscale_range(8,0) sets
+; the minimum vscale to 8 and the checks expect a single vl32-predicated
+; load / fminnm-with-itself / store sequence.
+define void @fcanonicalize_v32f32(ptr %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcanonicalize_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <32 x float>, ptr %a
+ %res = call <32 x float> @llvm.canonicalize.v32f32(<32 x float> %op)
+ store <32 x float> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <64 x float> vector in place at %a. vscale_range(16,0) sets
+; the minimum vscale to 16 and the checks expect a single vl64-predicated
+; load / fminnm-with-itself / store sequence.
+define void @fcanonicalize_v64f32(ptr %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcanonicalize_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <64 x float>, ptr %a
+ %res = call <64 x float> @llvm.canonicalize.v64f32(<64 x float> %op)
+ store <64 x float> %res, ptr %a
+ ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+; llvm.canonicalize on <1 x double> is expected to become a single scalar
+; fminnm of d0 with itself, with no predicate or z-register use.
+define <1 x double> @fcanonicalize_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d0, d0, d0
+; CHECK-NEXT: ret
+ %res = call <1 x double> @llvm.canonicalize.v1f64(<1 x double> %op)
+ ret <1 x double> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+; llvm.canonicalize on <2 x double> is expected to become a single NEON fminnm
+; of the operand with itself (v0.2d), with no predicate or z-register use.
+define <2 x double> @fcanonicalize_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.2d, v0.2d, v0.2d
+; CHECK-NEXT: ret
+ %res = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %op)
+ ret <2 x double> %res
+}
+
+; Canonicalises a <4 x double> vector in place at %a. The expected SVE code
+; loads it under a vl4 predicate, applies a predicated fminnm of the value
+; with itself and stores the result back.
+define void @fcanonicalize_v4f64(ptr %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <4 x double>, ptr %a
+ %res = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %op)
+ store <4 x double> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <8 x double> vector in place at %a, with no vscale_range
+; attribute, so the expected code differs per check prefix:
+;  - VBITS_GE_256: two vl4 halves, the upper one addressed through x8
+;    (= 4 elements, scaled by lsl #3), each canonicalised by its own fminnm.
+;  - VBITS_GE_512: a single vl8-predicated load / fminnm / store.
+; NOTE(review): the prefixes are defined by RUN lines outside this hunk.
+define void @fcanonicalize_v8f64(ptr %a) #0 {
+; VBITS_GE_256-LABEL: fcanonicalize_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_256-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcanonicalize_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op = load <8 x double>, ptr %a
+ %res = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> %op)
+ store <8 x double> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <16 x double> vector in place at %a. vscale_range(8,0) sets
+; the minimum vscale to 8 and the checks expect a single vl16-predicated
+; load / fminnm-with-itself / store sequence.
+define void @fcanonicalize_v16f64(ptr %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcanonicalize_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <16 x double>, ptr %a
+ %res = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> %op)
+ store <16 x double> %res, ptr %a
+ ret void
+}
+
+; Canonicalises a <32 x double> vector in place at %a. vscale_range(16,0) sets
+; the minimum vscale to 16 and the checks expect a single vl32-predicated
+; load / fminnm-with-itself / store sequence.
+define void @fcanonicalize_v32f64(ptr %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcanonicalize_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %op = load <32 x double>, ptr %a
+ %res = call <32 x double> @llvm.canonicalize.v32f64(<32 x double> %op)
+ store <32 x double> %res, ptr %a
+ ret void
+}
+
attributes #0 = { "target-features"="+sve" }
-
-declare <4 x half> @llvm.ceil.v4f16(<4 x half>)
-declare <8 x half> @llvm.ceil.v8f16(<8 x half>)
-declare <16 x half> @llvm.ceil.v16f16(<16 x half>)
-declare <32 x half> @llvm.ceil.v32f16(<32 x half>)
-declare <64 x half> @llvm.ceil.v64f16(<64 x half>)
-declare <128 x half> @llvm.ceil.v128f16(<128 x half>)
-declare <2 x float> @llvm.ceil.v2f32(<2 x float>)
-declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
-declare <8 x float> @llvm.ceil.v8f32(<8 x float>)
-declare <16 x float> @llvm.ceil.v16f32(<16 x float>)
-declare <32 x float> @llvm.ceil.v32f32(<32 x float>)
-declare <64 x float> @llvm.ceil.v64f32(<64 x float>)
-declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
-declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
-declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
-declare <8 x double> @llvm.ceil.v8f64(<8 x double>)
-declare <16 x double> @llvm.ceil.v16f64(<16 x double>)
-declare <32 x double> @llvm.ceil.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.floor.v4f16(<4 x half>)
-declare <8 x half> @llvm.floor.v8f16(<8 x half>)
-declare <16 x half> @llvm.floor.v16f16(<16 x half>)
-declare <32 x half> @llvm.floor.v32f16(<32 x half>)
-declare <64 x half> @llvm.floor.v64f16(<64 x half>)
-declare <128 x half> @llvm.floor.v128f16(<128 x half>)
-declare <2 x float> @llvm.floor.v2f32(<2 x float>)
-declare <4 x float> @llvm.floor.v4f32(<4 x float>)
-declare <8 x float> @llvm.floor.v8f32(<8 x float>)
-declare <16 x float> @llvm.floor.v16f32(<16 x float>)
-declare <32 x float> @llvm.floor.v32f32(<32 x float>)
-declare <64 x float> @llvm.floor.v64f32(<64 x float>)
-declare <1 x double> @llvm.floor.v1f64(<1 x double>)
-declare <2 x double> @llvm.floor.v2f64(<2 x double>)
-declare <4 x double> @llvm.floor.v4f64(<4 x double>)
-declare <8 x double> @llvm.floor.v8f64(<8 x double>)
-declare <16 x double> @llvm.floor.v16f64(<16 x double>)
-declare <32 x double> @llvm.floor.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
-declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
-declare <16 x half> @llvm.nearbyint.v16f16(<16 x half>)
-declare <32 x half> @llvm.nearbyint.v32f16(<32 x half>)
-declare <64 x half> @llvm.nearbyint.v64f16(<64 x half>)
-declare <128 x half> @llvm.nearbyint.v128f16(<128 x half>)
-declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>)
-declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
-declare <8 x float> @llvm.nearbyint.v8f32(<8 x float>)
-declare <16 x float> @llvm.nearbyint.v16f32(<16 x float>)
-declare <32 x float> @llvm.nearbyint.v32f32(<32 x float>)
-declare <64 x float> @llvm.nearbyint.v64f32(<64 x float>)
-declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
-declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
-declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
-declare <8 x double> @llvm.nearbyint.v8f64(<8 x double>)
-declare <16 x double> @llvm.nearbyint.v16f64(<16 x double>)
-declare <32 x double> @llvm.nearbyint.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.rint.v4f16(<4 x half>)
-declare <8 x half> @llvm.rint.v8f16(<8 x half>)
-declare <16 x half> @llvm.rint.v16f16(<16 x half>)
-declare <32 x half> @llvm.rint.v32f16(<32 x half>)
-declare <64 x half> @llvm.rint.v64f16(<64 x half>)
-declare <128 x half> @llvm.rint.v128f16(<128 x half>)
-declare <2 x float> @llvm.rint.v2f32(<2 x float>)
-declare <4 x float> @llvm.rint.v4f32(<4 x float>)
-declare <8 x float> @llvm.rint.v8f32(<8 x float>)
-declare <16 x float> @llvm.rint.v16f32(<16 x float>)
-declare <32 x float> @llvm.rint.v32f32(<32 x float>)
-declare <64 x float> @llvm.rint.v64f32(<64 x float>)
-declare <1 x double> @llvm.rint.v1f64(<1 x double>)
-declare <2 x double> @llvm.rint.v2f64(<2 x double>)
-declare <4 x double> @llvm.rint.v4f64(<4 x double>)
-declare <8 x double> @llvm.rint.v8f64(<8 x double>)
-declare <16 x double> @llvm.rint.v16f64(<16 x double>)
-declare <32 x double> @llvm.rint.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.round.v4f16(<4 x half>)
-declare <8 x half> @llvm.round.v8f16(<8 x half>)
-declare <16 x half> @llvm.round.v16f16(<16 x half>)
-declare <32 x half> @llvm.round.v32f16(<32 x half>)
-declare <64 x half> @llvm.round.v64f16(<64 x half>)
-declare <128 x half> @llvm.round.v128f16(<128 x half>)
-declare <2 x float> @llvm.round.v2f32(<2 x float>)
-declare <4 x float> @llvm.round.v4f32(<4 x float>)
-declare <8 x float> @llvm.round.v8f32(<8 x float>)
-declare <16 x float> @llvm.round.v16f32(<16 x float>)
-declare <32 x float> @llvm.round.v32f32(<32 x float>)
-declare <64 x float> @llvm.round.v64f32(<64 x float>)
-declare <1 x double> @llvm.round.v1f64(<1 x double>)
-declare <2 x double> @llvm.round.v2f64(<2 x double>)
-declare <4 x double> @llvm.round.v4f64(<4 x double>)
-declare <8 x double> @llvm.round.v8f64(<8 x double>)
-declare <16 x double> @llvm.round.v16f64(<16 x double>)
-declare <32 x double> @llvm.round.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.roundeven.v4f16(<4 x half>)
-declare <8 x half> @llvm.roundeven.v8f16(<8 x half>)
-declare <16 x half> @llvm.roundeven.v16f16(<16 x half>)
-declare <32 x half> @llvm.roundeven.v32f16(<32 x half>)
-declare <64 x half> @llvm.roundeven.v64f16(<64 x half>)
-declare <128 x half> @llvm.roundeven.v128f16(<128 x half>)
-declare <2 x float> @llvm.roundeven.v2f32(<2 x float>)
-declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
-declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
-declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
-declare <32 x float> @llvm.roundeven.v32f32(<32 x float>)
-declare <64 x float> @llvm.roundeven.v64f32(<64 x float>)
-declare <1 x double> @llvm.roundeven.v1f64(<1 x double>)
-declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
-declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
-declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
-declare <16 x double> @llvm.roundeven.v16f64(<16 x double>)
-declare <32 x double> @llvm.roundeven.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.trunc.v4f16(<4 x half>)
-declare <8 x half> @llvm.trunc.v8f16(<8 x half>)
-declare <16 x half> @llvm.trunc.v16f16(<16 x half>)
-declare <32 x half> @llvm.trunc.v32f16(<32 x half>)
-declare <64 x half> @llvm.trunc.v64f16(<64 x half>)
-declare <128 x half> @llvm.trunc.v128f16(<128 x half>)
-declare <2 x float> @llvm.trunc.v2f32(<2 x float>)
-declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
-declare <8 x float> @llvm.trunc.v8f32(<8 x float>)
-declare <16 x float> @llvm.trunc.v16f32(<16 x float>)
-declare <32 x float> @llvm.trunc.v32f32(<32 x float>)
-declare <64 x float> @llvm.trunc.v64f32(<64 x float>)
-declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
-declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
-declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
-declare <8 x double> @llvm.trunc.v8f64(<8 x double>)
-declare <16 x double> @llvm.trunc.v16f64(<16 x double>)
-declare <32 x double> @llvm.trunc.v32f64(<32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll
index 2f3f99c..5cf8f82 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
+
+target triple = "aarch64-unknown-linux-gnu"
define <vscale x 8 x half> @fadd_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
; CHECK-LABEL: fadd_nxv8f16:
@@ -495,13 +498,20 @@
%complex = type { { double, double } }
define void @scalar_to_vector(ptr %outval, <vscale x 2 x i1> %pred, <vscale x 2 x double> %in1, <vscale x 2 x double> %in2) {
-; CHECK-LABEL: scalar_to_vector:
-; CHECK: // %bb.0:
-; CHECK-NEXT: faddv d0, p0, z0.d
-; CHECK-NEXT: faddv d1, p0, z1.d
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; SVE-LABEL: scalar_to_vector:
+; SVE: // %bb.0:
+; SVE-NEXT: faddv d0, p0, z0.d
+; SVE-NEXT: faddv d1, p0, z1.d
+; SVE-NEXT: mov v0.d[1], v1.d[0]
+; SVE-NEXT: str q0, [x0]
+; SVE-NEXT: ret
+;
+; SME-LABEL: scalar_to_vector:
+; SME: // %bb.0:
+; SME-NEXT: faddv d0, p0, z0.d
+; SME-NEXT: faddv d1, p0, z1.d
+; SME-NEXT: stp d0, d1, [x0]
+; SME-NEXT: ret
%imagp = getelementptr inbounds %complex, ptr %outval, i64 0, i32 0, i32 1
%1 = call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %in1)
%2 = call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %in2)
@@ -1089,72 +1099,206 @@
ret <vscale x 2 x double> %res
}
-declare <vscale x 8 x half> @llvm.aarch64.sve.frecps.x.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.frecps.x.nxv4f32(<vscale x 4 x float> , <vscale x 4 x float>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.frecps.x.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 8 x half> @canonicalize_nxv8f16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: canonicalize_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.canonicalize.nxv8f16(<vscale x 8 x half> %a)
+ ret <vscale x 8 x half> %r
+}
-declare <vscale x 8 x half> @llvm.aarch64.sve.frsqrts.x.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.frsqrts.x.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.frsqrts.x.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 4 x half> @canonicalize_nxv4f16(<vscale x 4 x half> %a) {
+; CHECK-LABEL: canonicalize_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x half> @llvm.canonicalize.nxv4f16(<vscale x 4 x half> %a)
+ ret <vscale x 4 x half> %r
+}
-declare <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>)
+define <vscale x 2 x half> @canonicalize_nxv2f16(<vscale x 2 x half> %a) {
+; CHECK-LABEL: canonicalize_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: ret
+ %r = call <vscale x 2 x half> @llvm.canonicalize.nxv2f16(<vscale x 2 x half> %a)
+ ret <vscale x 2 x half> %r
+}
-declare <vscale x 8 x half> @llvm.sqrt.nxv8f16( <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.sqrt.nxv4f16( <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.sqrt.nxv2f16( <vscale x 2 x half>)
-declare <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>)
+define <vscale x 4 x float> @canonicalize_nxv4f32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: canonicalize_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.canonicalize.nxv4f32(<vscale x 4 x float> %a)
+ ret <vscale x 4 x float> %r
+}
-declare <vscale x 8 x half> @llvm.fabs.nxv8f16( <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.fabs.nxv4f16( <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.fabs.nxv2f16( <vscale x 2 x half>)
-declare <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.fabs.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double>)
+define <vscale x 2 x float> @canonicalize_nxv2f32(<vscale x 2 x float> %a) {
+; CHECK-LABEL: canonicalize_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: ret
+ %r = call <vscale x 2 x float> @llvm.canonicalize.nxv2f32(<vscale x 2 x float> %a)
+ ret <vscale x 2 x float> %r
+}
-declare <vscale x 16 x half> @llvm.maxnum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.maxnum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.maxnum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.maxnum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.maxnum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.maxnum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.maxnum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 16 x half> @llvm.minnum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.minnum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.minnum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.minnum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.minnum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.minnum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.minnum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @canonicalize_nxv2f64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: canonicalize_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: ret
+ %r = call <vscale x 2 x double> @llvm.canonicalize.nxv2f64(<vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %r
+}
-declare <vscale x 16 x half> @llvm.maximum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.maximum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.maximum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.maximum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.maximum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.maximum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.maximum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.maximum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 16 x half> @llvm.minimum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.minimum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.minimum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.minimum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.minimum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.minimum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.minimum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.minimum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 8 x half> @maximumnum_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: maximumnum_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.maximumnum.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %res
+}
-; Function Attrs: nounwind readnone
-declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>) #2
+define <vscale x 4 x half> @maximumnum_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; CHECK-LABEL: maximumnum_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x half> @llvm.maximumnum.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @maximumnum_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: maximumnum_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x half> @llvm.maximumnum.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @maximumnum_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: maximumnum_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @maximumnum_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: maximumnum_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x float> @llvm.maximumnum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @maximumnum_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: maximumnum_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @minimumnum_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: minimumnum_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @minimumnum_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; CHECK-LABEL: minimumnum_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x half> @llvm.minimumnum.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @minimumnum_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: minimumnum_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x half> @llvm.minimumnum.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @minimumnum_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: minimumnum_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @minimumnum_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: minimumnum_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x float> @llvm.minimumnum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @minimumnum_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: minimumnum_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %res
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 369b698..da806dd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -1841,42 +1841,1072 @@
ret void
}
-declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.minnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
+;
+; FMAXIMUMNUM
+;
-declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.maxnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
+define <4 x half> @fmaximumnum_v4f16(<4 x half> %op1, <4 x half> %op2) {
+; CHECK-LABEL: fmaximumnum_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v4f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #32
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #30]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #28]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #26]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %res = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+ ret <4 x half> %res
+}
-declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.minimum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
+define <8 x half> @fmaximumnum_v8f16(<8 x half> %op1, <8 x half> %op2) {
+; CHECK-LABEL: fmaximumnum_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v8f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #46]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #44]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #42]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #38]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #36]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #34]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: ret
+ %res = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+ ret <8 x half> %res
+}
-declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.maximum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
+define void @fmaximumnum_v16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: fmaximumnum_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z3.h, p0/m, z3.h, z3.h
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z2.h, p0/m, z2.h, z2.h
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: fmaxnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v16f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #96
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT: ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT: ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT: stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #94]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #92]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #90]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #88]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #86]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #84]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #82]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #80]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #78]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #76]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #74]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #72]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #70]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #68]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #66]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #64]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #96
+; NONEON-NOSVE-NEXT: ret
+ %op1 = load <16 x half>, ptr %a
+ %op2 = load <16 x half>, ptr %b
+ %res = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+ store <16 x half> %res, ptr %a
+ ret void
+}
+
+define <2 x float> @fmaximumnum_v2f32(<2 x float> %op1, <2 x float> %op2) {
+; CHECK-LABEL: fmaximumnum_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v2f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #32
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %res = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+ ret <2 x float> %res
+}
+
+define <4 x float> @fmaximumnum_v4f32(<4 x float> %op1, <4 x float> %op2) {
+; CHECK-LABEL: fmaximumnum_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v4f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr s1, [sp]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: ret
+ %res = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+ ret <4 x float> %res
+}
+
+define void @fmaximumnum_v8f32(ptr %a, ptr %b) {
+; CHECK-LABEL: fmaximumnum_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z3.s, p0/m, z3.s, z3.s
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z2.s, p0/m, z2.s, z2.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v8f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #96
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT: ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT: ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT: stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #44]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #40]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #36]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #32]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr s1, [sp]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #64]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #96
+; NONEON-NOSVE-NEXT: ret
+ %op1 = load <8 x float>, ptr %a
+ %op2 = load <8 x float>, ptr %b
+ %res = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+ store <8 x float> %res, ptr %a
+ ret void
+}
+
+define <1 x double> @fmaximumnum_v1f64(<1 x double> %op1, <1 x double> %op2) {
+; CHECK-LABEL: fmaximumnum_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d1, d1, d1
+; CHECK-NEXT: fminnm d0, d0, d0
+; CHECK-NEXT: fmaxnm d0, d0, d1
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v1f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %res = call <1 x double> @llvm.maximumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+ ret <1 x double> %res
+}
+
+define <2 x double> @fmaximumnum_v2f64(<2 x double> %op1, <2 x double> %op2) {
+; CHECK-LABEL: fmaximumnum_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v2f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fmaxnm d2, d1, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr d1, [sp]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: ret
+ %res = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+ ret <2 x double> %res
+}
+
+define void @fmaximumnum_v4f64(ptr %a, ptr %b) {
+; CHECK-LABEL: fmaximumnum_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z3.d, p0/m, z3.d, z3.d
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z2.d, p0/m, z2.d, z2.d
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: fmaxnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v4f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #96
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT: ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT: ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT: stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #40]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fmaxnm d2, d1, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #32]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fmaxnm d2, d1, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr d1, [sp]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #96
+; NONEON-NOSVE-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %op2 = load <4 x double>, ptr %b
+ %res = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> %op1, <4 x double> %op2)
+ store <4 x double> %res, ptr %a
+ ret void
+}
+
+;
+; FMINIMUMNUM
+;
+
+define <4 x half> @fminimumnum_v4f16(<4 x half> %op1, <4 x half> %op2) {
+; CHECK-LABEL: fminimumnum_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v4f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #32
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #30]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #28]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #26]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %res = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+ ret <4 x half> %res
+}
+
+define <8 x half> @fminimumnum_v8f16(<8 x half> %op1, <8 x half> %op2) {
+; CHECK-LABEL: fminimumnum_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v8f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #46]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #44]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #42]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #38]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #36]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #34]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: ret
+ %res = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+ ret <8 x half> %res
+}
+
+define void @fminimumnum_v16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: fminimumnum_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z3.h, p0/m, z3.h, z3.h
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: fminnm z2.h, p0/m, z2.h, z2.h
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: fminnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v16f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #96
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT: ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT: ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT: stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #94]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #92]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #90]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #88]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #86]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #84]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #82]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #80]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #78]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #76]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #74]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #72]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #70]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #68]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr h1, [sp]
+; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #66]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #64]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #96
+; NONEON-NOSVE-NEXT: ret
+ %op1 = load <16 x half>, ptr %a
+ %op2 = load <16 x half>, ptr %b
+ %res = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+ store <16 x half> %res, ptr %a
+ ret void
+}
+
+define <2 x float> @fminimumnum_v2f32(<2 x float> %op1, <2 x float> %op2) {
+; CHECK-LABEL: fminimumnum_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v2f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #32
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %res = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+ ret <2 x float> %res
+}
+
+define <4 x float> @fminimumnum_v4f32(<4 x float> %op1, <4 x float> %op2) {
+; CHECK-LABEL: fminimumnum_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v4f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr s1, [sp]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: ret
+ %res = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+ ret <4 x float> %res
+}
+
+define void @fminimumnum_v8f32(ptr %a, ptr %b) {
+; CHECK-LABEL: fminimumnum_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z3.s, p0/m, z3.s, z3.s
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: fminnm z2.s, p0/m, z2.s, z2.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: fminnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v8f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #96
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT: ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT: ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT: stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #44]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #40]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #36]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #32]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr s1, [sp]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT: fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #64]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #96
+; NONEON-NOSVE-NEXT: ret
+ %op1 = load <8 x float>, ptr %a
+ %op2 = load <8 x float>, ptr %b
+ %res = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+ store <8 x float> %res, ptr %a
+ ret void
+}
+
+define <1 x double> @fminimumnum_v1f64(<1 x double> %op1, <1 x double> %op2) {
+; CHECK-LABEL: fminimumnum_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d1, d1, d1
+; CHECK-NEXT: fminnm d0, d0, d0
+; CHECK-NEXT: fminnm d0, d0, d1
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v1f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d1
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %res = call <1 x double> @llvm.minimumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+ ret <1 x double> %res
+}
+
+define <2 x double> @fminimumnum_v2f64(<2 x double> %op1, <2 x double> %op2) {
+; CHECK-LABEL: fminimumnum_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v2f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fminnm d2, d1, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr d1, [sp]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: ret
+ %res = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+ ret <2 x double> %res
+}
+
+define void @fminimumnum_v4f64(ptr %a, ptr %b) {
+; CHECK-LABEL: fminimumnum_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z3.d, p0/m, z3.d, z3.d
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: fminnm z2.d, p0/m, z2.d, z2.d
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: fminnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v4f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #96
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT: ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT: ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT: stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #40]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fminnm d2, d1, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #32]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT: ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d2, d1, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr d1, [sp]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT: fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #96
+; NONEON-NOSVE-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %op2 = load <4 x double>, ptr %b
+ %res = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> %op1, <4 x double> %op2)
+ store <4 x double> %res, ptr %a
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index f278423..765ad87 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -2903,142 +2903,416 @@
ret void
}
-declare <2 x half> @llvm.ceil.v2f16(<2 x half>)
-declare <4 x half> @llvm.ceil.v4f16(<4 x half>)
-declare <8 x half> @llvm.ceil.v8f16(<8 x half>)
-declare <16 x half> @llvm.ceil.v16f16(<16 x half>)
-declare <32 x half> @llvm.ceil.v32f16(<32 x half>)
-declare <64 x half> @llvm.ceil.v64f16(<64 x half>)
-declare <128 x half> @llvm.ceil.v128f16(<128 x half>)
-declare <2 x float> @llvm.ceil.v2f32(<2 x float>)
-declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
-declare <8 x float> @llvm.ceil.v8f32(<8 x float>)
-declare <16 x float> @llvm.ceil.v16f32(<16 x float>)
-declare <32 x float> @llvm.ceil.v32f32(<32 x float>)
-declare <64 x float> @llvm.ceil.v64f32(<64 x float>)
-declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
-declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
-declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
-declare <8 x double> @llvm.ceil.v8f64(<8 x double>)
-declare <16 x double> @llvm.ceil.v16f64(<16 x double>)
-declare <32 x double> @llvm.ceil.v32f64(<32 x double>)
+;
+; FCANONICALIZE -> FMINNM
+;
-declare <2 x half> @llvm.floor.v2f16(<2 x half>)
-declare <4 x half> @llvm.floor.v4f16(<4 x half>)
-declare <8 x half> @llvm.floor.v8f16(<8 x half>)
-declare <16 x half> @llvm.floor.v16f16(<16 x half>)
-declare <32 x half> @llvm.floor.v32f16(<32 x half>)
-declare <64 x half> @llvm.floor.v64f16(<64 x half>)
-declare <128 x half> @llvm.floor.v128f16(<128 x half>)
-declare <2 x float> @llvm.floor.v2f32(<2 x float>)
-declare <4 x float> @llvm.floor.v4f32(<4 x float>)
-declare <8 x float> @llvm.floor.v8f32(<8 x float>)
-declare <16 x float> @llvm.floor.v16f32(<16 x float>)
-declare <32 x float> @llvm.floor.v32f32(<32 x float>)
-declare <64 x float> @llvm.floor.v64f32(<64 x float>)
-declare <1 x double> @llvm.floor.v1f64(<1 x double>)
-declare <2 x double> @llvm.floor.v2f64(<2 x double>)
-declare <4 x double> @llvm.floor.v4f64(<4 x double>)
-declare <8 x double> @llvm.floor.v8f64(<8 x double>)
-declare <16 x double> @llvm.floor.v16f64(<16 x double>)
-declare <32 x double> @llvm.floor.v32f64(<32 x double>)
+define <2 x half> @fcanonicalize_v2f16(<2 x half> %op) {
+; CHECK-LABEL: fcanonicalize_v2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v2f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #14]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #12]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #10]
+; NONEON-NOSVE-NEXT: ldr h0, [sp]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %res = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %op)
+ ret <2 x half> %res
+}
-declare <2 x half> @llvm.nearbyint.v2f16(<2 x half>)
-declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
-declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
-declare <16 x half> @llvm.nearbyint.v16f16(<16 x half>)
-declare <32 x half> @llvm.nearbyint.v32f16(<32 x half>)
-declare <64 x half> @llvm.nearbyint.v64f16(<64 x half>)
-declare <128 x half> @llvm.nearbyint.v128f16(<128 x half>)
-declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>)
-declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
-declare <8 x float> @llvm.nearbyint.v8f32(<8 x float>)
-declare <16 x float> @llvm.nearbyint.v16f32(<16 x float>)
-declare <32 x float> @llvm.nearbyint.v32f32(<32 x float>)
-declare <64 x float> @llvm.nearbyint.v64f32(<64 x float>)
-declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
-declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
-declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
-declare <8 x double> @llvm.nearbyint.v8f64(<8 x double>)
-declare <16 x double> @llvm.nearbyint.v16f64(<16 x double>)
-declare <32 x double> @llvm.nearbyint.v32f64(<32 x double>)
+define <4 x half> @fcanonicalize_v4f16(<4 x half> %op) {
+; CHECK-LABEL: fcanonicalize_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v4f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #14]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #12]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #10]
+; NONEON-NOSVE-NEXT: ldr h0, [sp]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %res = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %op)
+ ret <4 x half> %res
+}
-declare <2 x half> @llvm.rint.v2f16(<2 x half>)
-declare <4 x half> @llvm.rint.v4f16(<4 x half>)
-declare <8 x half> @llvm.rint.v8f16(<8 x half>)
-declare <16 x half> @llvm.rint.v16f16(<16 x half>)
-declare <32 x half> @llvm.rint.v32f16(<32 x half>)
-declare <64 x half> @llvm.rint.v64f16(<64 x half>)
-declare <128 x half> @llvm.rint.v128f16(<128 x half>)
-declare <2 x float> @llvm.rint.v2f32(<2 x float>)
-declare <4 x float> @llvm.rint.v4f32(<4 x float>)
-declare <8 x float> @llvm.rint.v8f32(<8 x float>)
-declare <16 x float> @llvm.rint.v16f32(<16 x float>)
-declare <32 x float> @llvm.rint.v32f32(<32 x float>)
-declare <64 x float> @llvm.rint.v64f32(<64 x float>)
-declare <1 x double> @llvm.rint.v1f64(<1 x double>)
-declare <2 x double> @llvm.rint.v2f64(<2 x double>)
-declare <4 x double> @llvm.rint.v4f64(<4 x double>)
-declare <8 x double> @llvm.rint.v8f64(<8 x double>)
-declare <16 x double> @llvm.rint.v16f64(<16 x double>)
-declare <32 x double> @llvm.rint.v32f64(<32 x double>)
+define <8 x half> @fcanonicalize_v8f16(<8 x half> %op) {
+; CHECK-LABEL: fcanonicalize_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: ret
+; llvm.canonicalize on <8 x half> is expected as fminnm(x, x), either predicated over 8 lanes or once per lane after fcvt to single precision and back.
+; NONEON-NOSVE-LABEL: fcanonicalize_v8f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #30]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #28]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #26]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #22]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #20]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #18]
+; NONEON-NOSVE-NEXT: ldr h0, [sp]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %res = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %op)
+ ret <8 x half> %res
+}
-declare <2 x half> @llvm.round.v2f16(<2 x half>)
-declare <4 x half> @llvm.round.v4f16(<4 x half>)
-declare <8 x half> @llvm.round.v8f16(<8 x half>)
-declare <16 x half> @llvm.round.v16f16(<16 x half>)
-declare <32 x half> @llvm.round.v32f16(<32 x half>)
-declare <64 x half> @llvm.round.v64f16(<64 x half>)
-declare <128 x half> @llvm.round.v128f16(<128 x half>)
-declare <2 x float> @llvm.round.v2f32(<2 x float>)
-declare <4 x float> @llvm.round.v4f32(<4 x float>)
-declare <8 x float> @llvm.round.v8f32(<8 x float>)
-declare <16 x float> @llvm.round.v16f32(<16 x float>)
-declare <32 x float> @llvm.round.v32f32(<32 x float>)
-declare <64 x float> @llvm.round.v64f32(<64 x float>)
-declare <1 x double> @llvm.round.v1f64(<1 x double>)
-declare <2 x double> @llvm.round.v2f64(<2 x double>)
-declare <4 x double> @llvm.round.v4f64(<4 x double>)
-declare <8 x double> @llvm.round.v8f64(<8 x double>)
-declare <16 x double> @llvm.round.v16f64(<16 x double>)
-declare <32 x double> @llvm.round.v32f64(<32 x double>)
+define void @fcanonicalize_v16f16(ptr %a) {
+; CHECK-LABEL: fcanonicalize_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+; In-place llvm.canonicalize of the <16 x half> at %a is expected as two 8-lane predicated fminnm(x, x), or one scalar fminnm(x, x) per lane after fcvt to single precision and back.
+; NONEON-NOSVE-LABEL: fcanonicalize_v16f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #62]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #60]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #58]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #54]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #52]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #50]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #46]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #44]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #42]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #38]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #36]
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #34]
+; NONEON-NOSVE-NEXT: ldr h0, [sp]
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: str h0, [sp, #32]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #64
+; NONEON-NOSVE-NEXT: ret
+ %op = load <16 x half>, ptr %a
+ %res = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %op)
+ store <16 x half> %res, ptr %a
+ ret void
+}
-declare <2 x half> @llvm.roundeven.v2f16(<2 x half>)
-declare <4 x half> @llvm.roundeven.v4f16(<4 x half>)
-declare <8 x half> @llvm.roundeven.v8f16(<8 x half>)
-declare <16 x half> @llvm.roundeven.v16f16(<16 x half>)
-declare <32 x half> @llvm.roundeven.v32f16(<32 x half>)
-declare <64 x half> @llvm.roundeven.v64f16(<64 x half>)
-declare <128 x half> @llvm.roundeven.v128f16(<128 x half>)
-declare <2 x float> @llvm.roundeven.v2f32(<2 x float>)
-declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
-declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
-declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
-declare <32 x float> @llvm.roundeven.v32f32(<32 x float>)
-declare <64 x float> @llvm.roundeven.v64f32(<64 x float>)
-declare <1 x double> @llvm.roundeven.v1f64(<1 x double>)
-declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
-declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
-declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
-declare <16 x double> @llvm.roundeven.v16f64(<16 x double>)
-declare <32 x double> @llvm.roundeven.v32f64(<32 x double>)
+define <2 x float> @fcanonicalize_v2f32(<2 x float> %op) {
+; CHECK-LABEL: fcanonicalize_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: ret
+; llvm.canonicalize on <2 x float> is expected as fminnm(x, x), either predicated over 2 lanes or as one scalar fminnm per lane via the stack.
+; NONEON-NOSVE-LABEL: fcanonicalize_v2f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT: fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %res = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %op)
+ ret <2 x float> %res
+}
-declare <2 x half> @llvm.trunc.v2f16(<2 x half>)
-declare <4 x half> @llvm.trunc.v4f16(<4 x half>)
-declare <8 x half> @llvm.trunc.v8f16(<8 x half>)
-declare <16 x half> @llvm.trunc.v16f16(<16 x half>)
-declare <32 x half> @llvm.trunc.v32f16(<32 x half>)
-declare <64 x half> @llvm.trunc.v64f16(<64 x half>)
-declare <128 x half> @llvm.trunc.v128f16(<128 x half>)
-declare <2 x float> @llvm.trunc.v2f32(<2 x float>)
-declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
-declare <8 x float> @llvm.trunc.v8f32(<8 x float>)
-declare <16 x float> @llvm.trunc.v16f32(<16 x float>)
-declare <32 x float> @llvm.trunc.v32f32(<32 x float>)
-declare <64 x float> @llvm.trunc.v64f32(<64 x float>)
-declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
-declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
-declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
-declare <8 x double> @llvm.trunc.v8f64(<8 x double>)
-declare <16 x double> @llvm.trunc.v16f64(<16 x double>)
-declare <32 x double> @llvm.trunc.v32f64(<32 x double>)
+define <4 x float> @fcanonicalize_v4f32(<4 x float> %op) {
+; CHECK-LABEL: fcanonicalize_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: ret
+; llvm.canonicalize on <4 x float> is expected as fminnm(x, x), either predicated over 4 lanes or as one scalar fminnm per lane via the stack.
+; NONEON-NOSVE-LABEL: fcanonicalize_v4f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT: fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %res = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %op)
+ ret <4 x float> %res
+}
+
+define void @fcanonicalize_v8f32(ptr %a) {
+; CHECK-LABEL: fcanonicalize_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+; In-place llvm.canonicalize of the <8 x float> at %a is expected as two 4-lane predicated fminnm(x, x), or one scalar fminnm(x, x) per lane via the stack.
+; NONEON-NOSVE-LABEL: fcanonicalize_v8f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT: fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT: fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT: fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT: fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT: ldr s0, [sp]
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #64
+; NONEON-NOSVE-NEXT: ret
+ %op = load <8 x float>, ptr %a
+ %res = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %op)
+ store <8 x float> %res, ptr %a
+ ret void
+}
+
+define <1 x double> @fcanonicalize_v1f64(<1 x double> %op) {
+; CHECK-LABEL: fcanonicalize_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d0, d0, d0
+; CHECK-NEXT: ret
+; llvm.canonicalize on <1 x double> is expected as a single scalar fminnm d0, d0, d0 in both sequences; the second one also adjusts sp by 16 around it.
+; NONEON-NOSVE-LABEL: fcanonicalize_v1f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %res = call <1 x double> @llvm.canonicalize.v1f64(<1 x double> %op)
+ ret <1 x double> %res
+}
+
+define <2 x double> @fcanonicalize_v2f64(<2 x double> %op) {
+; CHECK-LABEL: fcanonicalize_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: ret
+; llvm.canonicalize on <2 x double> is expected as fminnm(x, x), either predicated over 2 lanes or as one scalar fminnm per lane via the stack.
+; NONEON-NOSVE-LABEL: fcanonicalize_v2f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm d1, d0, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %res = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %op)
+ ret <2 x double> %res
+}
+
+define void @fcanonicalize_v4f64(ptr %a) {
+; CHECK-LABEL: fcanonicalize_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+; In-place llvm.canonicalize of the <4 x double> at %a is expected as two 2-lane predicated fminnm(x, x), or one scalar fminnm(x, x) per lane via the stack.
+; NONEON-NOSVE-LABEL: fcanonicalize_v4f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT: fminnm d1, d0, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: fminnm d1, d0, d0
+; NONEON-NOSVE-NEXT: ldr d0, [sp]
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #64
+; NONEON-NOSVE-NEXT: ret
+ %op = load <4 x double>, ptr %a
+ %res = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %op)
+ store <4 x double> %res, ptr %a
+ ret void
+}