[LLVM][CodeGen][SVE] Implement isel for maximumnum/minimumnum. (#185074)

Patch to add custom lowering for FCANONICALIZE, FMAXNUM_IEEE, and
FMINNUM_IEEE, all of which are required when relying on default
expansion of FMAXIMUMNUM and FMINIMUMNUM.
    
The lowering is very simple because AArch64's FMAXNM and FMINNM
instructions are IEEE754-2008 compliant, with the implementation
effectively following the same path taken for NEON.

NOTE: Bfloat support will be provided separately.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fa8c19d..d1212f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1761,13 +1761,16 @@
       setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::FADD, VT, Custom);
+      setOperationAction(ISD::FCANONICALIZE, VT, Custom);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
       setOperationAction(ISD::FDIV, VT, Custom);
       setOperationAction(ISD::FMA, VT, Custom);
       setOperationAction(ISD::FMAXIMUM, VT, Custom);
       setOperationAction(ISD::FMAXNUM, VT, Custom);
+      setOperationAction(ISD::FMAXNUM_IEEE, VT, Custom);
       setOperationAction(ISD::FMINIMUM, VT, Custom);
       setOperationAction(ISD::FMINNUM, VT, Custom);
+      setOperationAction(ISD::FMINNUM_IEEE, VT, Custom);
       setOperationAction(ISD::FMUL, VT, Custom);
       setOperationAction(ISD::FNEG, VT, Custom);
       setOperationAction(ISD::FSUB, VT, Custom);
@@ -2457,6 +2460,7 @@
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Default);
   setOperationAction(ISD::FABS, VT, Default);
   setOperationAction(ISD::FADD, VT, Default);
+  setOperationAction(ISD::FCANONICALIZE, VT, Default);
   setOperationAction(ISD::FCEIL, VT, Default);
   setOperationAction(ISD::FCOPYSIGN, VT, Default);
   setOperationAction(ISD::FDIV, VT, Default);
@@ -2464,8 +2468,10 @@
   setOperationAction(ISD::FMA, VT, Default);
   setOperationAction(ISD::FMAXIMUM, VT, Default);
   setOperationAction(ISD::FMAXNUM, VT, Default);
+  setOperationAction(ISD::FMAXNUM_IEEE, VT, Default);
   setOperationAction(ISD::FMINIMUM, VT, Default);
   setOperationAction(ISD::FMINNUM, VT, Default);
+  setOperationAction(ISD::FMINNUM_IEEE, VT, Default);
   setOperationAction(ISD::FMUL, VT, Default);
   setOperationAction(ISD::FNEARBYINT, VT, Default);
   setOperationAction(ISD::FNEG, VT, Default);
@@ -8393,10 +8399,12 @@
   case ISD::FMAXIMUM:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
   case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
   case ISD::FMINIMUM:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
   case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
   case ISD::VSELECT:
     return LowerFixedLengthVectorSelectToSVE(Op, DAG);
@@ -8488,6 +8496,8 @@
     return LowerPARTIAL_REDUCE_MLA(Op, DAG);
   case ISD::CLMUL:
     return LowerCLMUL(Op, DAG);
+  case ISD::FCANONICALIZE:
+    return LowerFCANONICALIZE(Op, DAG);
   }
 }
 
@@ -33502,3 +33512,25 @@
                                                    EVT VT) const {
   return Subtarget->hasCPA() && UseFEATCPACodegen;
 }
+
+SDValue AArch64TargetLowering::LowerFCANONICALIZE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  assert(VT.isVector() && "Expected vector type!");
+
+  SDValue In = Op.getOperand(0);
+  SDValue Pg = getPredicateForVector(DAG, DL, VT);
+
+  // Canonicalize as FMINNM(In, In): FMINNM follows IEEE754-2008 and returns a
+  // canonical result. Fixed-length vectors go through an SVE container type.
+  if (VT.isScalableVector())
+    return DAG.getNode(AArch64ISD::FMINNM_PRED, DL, VT, Pg, In, In);
+
+  assert(useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) &&
+         "Expected to lower to SVE!");
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+  In = convertToScalableVector(DAG, ContainerVT, In);
+  In = DAG.getNode(AArch64ISD::FMINNM_PRED, DL, ContainerVT, Pg, In, In);
+  return convertFromScalableVector(DAG, VT, In);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d8b4d98..49ff76b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -786,7 +786,7 @@
   SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
-
+  SDValue LowerFCANONICALIZE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
 
   SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2900cf0..8456694 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6138,6 +6138,8 @@
           (v2f64 (FMAXNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>;
 def : Pat<(v2f64 (fcanonicalize (v2f64 V128:$Rn))),
           (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rn)))>;
+def : Pat<(v1f64 (fcanonicalize (v1f64 V64:$Rn))),
+          (v1f64 (FMINNMDrr (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rn)))>;
 def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
           (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>;
 def : Pat<(v4f32 (fmaxnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
diff --git a/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll
index 753e2b7..3199d5f 100644
--- a/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll
+++ b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll
@@ -534,6 +534,54 @@
   ret double %z
 }
 
+define <1 x double> @fcanonicalize_v1f64(<1 x double> %x) {
+; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v1f64:
+; CHECK-NOFP16-NONEON:       // %bb.0:
+; CHECK-NOFP16-NONEON-NEXT:    fminnm d0, d0, d0
+; CHECK-NOFP16-NONEON-NEXT:    ret
+;
+; CHECK-FP16-NONEON-LABEL: fcanonicalize_v1f64:
+; CHECK-FP16-NONEON:       // %bb.0:
+; CHECK-FP16-NONEON-NEXT:    fminnm d0, d0, d0
+; CHECK-FP16-NONEON-NEXT:    ret
+;
+; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v1f64:
+; CHECK-NOFP16-NEON:       // %bb.0:
+; CHECK-NOFP16-NEON-NEXT:    fminnm d0, d0, d0
+; CHECK-NOFP16-NEON-NEXT:    ret
+;
+; CHECK-FP16-NEON-LABEL: fcanonicalize_v1f64:
+; CHECK-FP16-NEON:       // %bb.0:
+; CHECK-FP16-NEON-NEXT:    fminnm d0, d0, d0
+; CHECK-FP16-NEON-NEXT:    ret
+  %z = call <1 x double> @llvm.canonicalize.v1f64(<1 x double> %x)
+  ret <1 x double> %z
+}
+
+define <1 x double> @fcanonicalize_v1f64_nnan(<1 x double> %x) {
+; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-NOFP16-NONEON:       // %bb.0:
+; CHECK-NOFP16-NONEON-NEXT:    fminnm d0, d0, d0
+; CHECK-NOFP16-NONEON-NEXT:    ret
+;
+; CHECK-FP16-NONEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-FP16-NONEON:       // %bb.0:
+; CHECK-FP16-NONEON-NEXT:    fminnm d0, d0, d0
+; CHECK-FP16-NONEON-NEXT:    ret
+;
+; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-NOFP16-NEON:       // %bb.0:
+; CHECK-NOFP16-NEON-NEXT:    fminnm d0, d0, d0
+; CHECK-NOFP16-NEON-NEXT:    ret
+;
+; CHECK-FP16-NEON-LABEL: fcanonicalize_v1f64_nnan:
+; CHECK-FP16-NEON:       // %bb.0:
+; CHECK-FP16-NEON-NEXT:    fminnm d0, d0, d0
+; CHECK-FP16-NEON-NEXT:    ret
+  %z = call nnan <1 x double> @llvm.canonicalize.v1f64(<1 x double> %x)
+  ret <1 x double> %z
+}
+
 define <2 x double> @fcanonicalize_v2f64(<2 x double> %x) {
 ; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f64:
 ; CHECK-NOFP16-NONEON:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
index 90a0499..1ed569c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
@@ -1197,80 +1197,696 @@
   ret void
 }
 
+;
+; FMAXIMUMNUM
+;
+
+; Don't use SVE for 64-bit vectors.
+define <4 x half> @fmaximumnum_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.4h, v1.4h, v1.4h
+; CHECK-NEXT:    fminnm v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    fmaxnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+  ret <4 x half> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <8 x half> @fmaximumnum_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.8h, v1.8h, v1.8h
+; CHECK-NEXT:    fminnm v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    fmaxnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+  ret <8 x half> %res
+}
+
+define void @fmaximumnum_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %res = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+  store <16 x half> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v32f16(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fmaximumnum_v32f16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_EQ_256-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_EQ_256-NEXT:    fminnm z2.h, p0/m, z2.h, z2.h
+; VBITS_EQ_256-NEXT:    fminnm z3.h, p0/m, z3.h, z3.h
+; VBITS_EQ_256-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_EQ_256-NEXT:    fmaxnm z2.h, p0/m, z2.h, z3.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z2.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaximumnum_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_GE_512-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_512-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x half>, ptr %a
+  %op2 = load <32 x half>, ptr %b
+  %res = call <32 x half> @llvm.maximumnum.v32f16(<32 x half> %op1, <32 x half> %op2)
+  store <32 x half> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fmaximumnum_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <64 x half>, ptr %a
+  %op2 = load <64 x half>, ptr %b
+  %res = call <64 x half> @llvm.maximumnum.v64f16(<64 x half> %op1, <64 x half> %op2)
+  store <64 x half> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fmaximumnum_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <128 x half>, ptr %a
+  %op2 = load <128 x half>, ptr %b
+  %res = call <128 x half> @llvm.maximumnum.v128f16(<128 x half> %op1, <128 x half> %op2)
+  store <128 x half> %res, ptr %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @fmaximumnum_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.2s, v1.2s, v1.2s
+; CHECK-NEXT:    fminnm v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmaxnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+  ret <2 x float> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @fmaximumnum_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fminnm v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+  ret <4 x float> %res
+}
+
+define void @fmaximumnum_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, ptr %a
+  %op2 = load <8 x float>, ptr %b
+  %res = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+  store <8 x float> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v16f32(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fmaximumnum_v16f32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_EQ_256-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_EQ_256-NEXT:    fminnm z2.s, p0/m, z2.s, z2.s
+; VBITS_EQ_256-NEXT:    fminnm z3.s, p0/m, z3.s, z3.s
+; VBITS_EQ_256-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_256-NEXT:    fmaxnm z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z2.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaximumnum_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_GE_512-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_512-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x float>, ptr %a
+  %op2 = load <16 x float>, ptr %b
+  %res = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> %op1, <16 x float> %op2)
+  store <16 x float> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fmaximumnum_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <32 x float>, ptr %a
+  %op2 = load <32 x float>, ptr %b
+  %res = call <32 x float> @llvm.maximumnum.v32f32(<32 x float> %op1, <32 x float> %op2)
+  store <32 x float> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fmaximumnum_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <64 x float>, ptr %a
+  %op2 = load <64 x float>, ptr %b
+  %res = call <64 x float> @llvm.maximumnum.v64f32(<64 x float> %op1, <64 x float> %op2)
+  store <64 x float> %res, ptr %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <1 x double> @fmaximumnum_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm d1, d1, d1
+; CHECK-NEXT:    fminnm d0, d0, d0
+; CHECK-NEXT:    fmaxnm d0, d0, d1
+; CHECK-NEXT:    ret
+  %res = call <1 x double> @llvm.maximumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+  ret <1 x double> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <2 x double> @fmaximumnum_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.2d, v1.2d, v1.2d
+; CHECK-NEXT:    fminnm v0.2d, v0.2d, v0.2d
+; CHECK-NEXT:    fmaxnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %res = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+  ret <2 x double> %res
+}
+
+define void @fmaximumnum_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fmaximumnum_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, ptr %a
+  %op2 = load <4 x double>, ptr %b
+  %res = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> %op1, <4 x double> %op2)
+  store <4 x double> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v8f64(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fmaximumnum_v8f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_EQ_256-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_EQ_256-NEXT:    fminnm z2.d, p0/m, z2.d, z2.d
+; VBITS_EQ_256-NEXT:    fminnm z3.d, p0/m, z3.d, z3.d
+; VBITS_EQ_256-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_256-NEXT:    fmaxnm z2.d, p0/m, z2.d, z3.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z2.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaximumnum_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_GE_512-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_512-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x double>, ptr %a
+  %op2 = load <8 x double>, ptr %b
+  %res = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> %op1, <8 x double> %op2)
+  store <8 x double> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fmaximumnum_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x double>, ptr %a
+  %op2 = load <16 x double>, ptr %b
+  %res = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> %op1, <16 x double> %op2)
+  store <16 x double> %res, ptr %a
+  ret void
+}
+
+define void @fmaximumnum_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fmaximumnum_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <32 x double>, ptr %a
+  %op2 = load <32 x double>, ptr %b
+  %res = call <32 x double> @llvm.maximumnum.v32f64(<32 x double> %op1, <32 x double> %op2)
+  store <32 x double> %res, ptr %a
+  ret void
+}
+
+;
+; FMINIMUMNUM
+;
+
+; Don't use SVE for 64-bit vectors.
+define <4 x half> @fminimumnum_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.4h, v1.4h, v1.4h
+; CHECK-NEXT:    fminnm v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    fminnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+  ret <4 x half> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <8 x half> @fminimumnum_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.8h, v1.8h, v1.8h
+; CHECK-NEXT:    fminnm v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    fminnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+  ret <8 x half> %res
+}
+
+define void @fminimumnum_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %res = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+  store <16 x half> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v32f16(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fminimumnum_v32f16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_EQ_256-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_EQ_256-NEXT:    fminnm z2.h, p0/m, z2.h, z2.h
+; VBITS_EQ_256-NEXT:    fminnm z3.h, p0/m, z3.h, z3.h
+; VBITS_EQ_256-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_EQ_256-NEXT:    fminnm z2.h, p0/m, z2.h, z3.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z2.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminimumnum_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_GE_512-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_512-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x half>, ptr %a
+  %op2 = load <32 x half>, ptr %b
+  %res = call <32 x half> @llvm.minimumnum.v32f16(<32 x half> %op1, <32 x half> %op2)
+  store <32 x half> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fminimumnum_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <64 x half>, ptr %a
+  %op2 = load <64 x half>, ptr %b
+  %res = call <64 x half> @llvm.minimumnum.v64f16(<64 x half> %op1, <64 x half> %op2)
+  store <64 x half> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fminimumnum_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <128 x half>, ptr %a
+  %op2 = load <128 x half>, ptr %b
+  %res = call <128 x half> @llvm.minimumnum.v128f16(<128 x half> %op1, <128 x half> %op2)
+  store <128 x half> %res, ptr %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @fminimumnum_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.2s, v1.2s, v1.2s
+; CHECK-NEXT:    fminnm v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fminnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+  ret <2 x float> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @fminimumnum_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fminnm v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fminnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+  ret <4 x float> %res
+}
+
+define void @fminimumnum_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, ptr %a
+  %op2 = load <8 x float>, ptr %b
+  %res = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+  store <8 x float> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v16f32(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fminimumnum_v16f32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_EQ_256-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_EQ_256-NEXT:    fminnm z2.s, p0/m, z2.s, z2.s
+; VBITS_EQ_256-NEXT:    fminnm z3.s, p0/m, z3.s, z3.s
+; VBITS_EQ_256-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_256-NEXT:    fminnm z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z2.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminimumnum_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_GE_512-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_512-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x float>, ptr %a
+  %op2 = load <16 x float>, ptr %b
+  %res = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> %op1, <16 x float> %op2)
+  store <16 x float> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fminimumnum_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <32 x float>, ptr %a
+  %op2 = load <32 x float>, ptr %b
+  %res = call <32 x float> @llvm.minimumnum.v32f32(<32 x float> %op1, <32 x float> %op2)
+  store <32 x float> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fminimumnum_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <64 x float>, ptr %a
+  %op2 = load <64 x float>, ptr %b
+  %res = call <64 x float> @llvm.minimumnum.v64f32(<64 x float> %op1, <64 x float> %op2)
+  store <64 x float> %res, ptr %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <1 x double> @fminimumnum_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm d1, d1, d1
+; CHECK-NEXT:    fminnm d0, d0, d0
+; CHECK-NEXT:    fminnm d0, d0, d1
+; CHECK-NEXT:    ret
+  %res = call <1 x double> @llvm.minimumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+  ret <1 x double> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <2 x double> @fminimumnum_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v1.2d, v1.2d, v1.2d
+; CHECK-NEXT:    fminnm v0.2d, v0.2d, v0.2d
+; CHECK-NEXT:    fminnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %res = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+  ret <2 x double> %res
+}
+
+define void @fminimumnum_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fminimumnum_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, ptr %a
+  %op2 = load <4 x double>, ptr %b
+  %res = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> %op1, <4 x double> %op2) ; first vector size that uses predicated z registers: self-fminnm per operand (canonicalize), then the minimum, all under the vl4 predicate
+  store <4 x double> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v8f64(ptr %a, ptr %b) #0 {
+; VBITS_EQ_256-LABEL: fminimumnum_v8f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_EQ_256-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_EQ_256-NEXT:    fminnm z2.d, p0/m, z2.d, z2.d
+; VBITS_EQ_256-NEXT:    fminnm z3.d, p0/m, z3.d, z3.d
+; VBITS_EQ_256-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_256-NEXT:    fminnm z2.d, p0/m, z2.d, z3.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z2.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminimumnum_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_GE_512-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_512-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x double>, ptr %a
+  %op2 = load <8 x double>, ptr %b
+  %res = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> %op1, <8 x double> %op2) ; no vscale_range on this function, so the output depends on the configured vector width: two vl4 halves (four canonicalizing self-fminnm plus two minimums) in the 256-bit run, a single vl8 sequence in the wider runs
+  store <8 x double> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fminimumnum_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x double>, ptr %a
+  %op2 = load <16 x double>, ptr %b
+  %res = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> %op1, <16 x double> %op2) ; single vl16 sequence: self-fminnm per operand (canonicalize), then the minimum
+  store <16 x double> %res, ptr %a
+  ret void
+}
+
+define void @fminimumnum_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fminimumnum_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <32 x double>, ptr %a
+  %op2 = load <32 x double>, ptr %b
+  %res = call <32 x double> @llvm.minimumnum.v32f64(<32 x double> %op1, <32 x double> %op2) ; single vl32 sequence: self-fminnm per operand (canonicalize), then the minimum
+  store <32 x double> %res, ptr %a
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" }
-
-declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.minnum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.minnum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.minnum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.minnum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.minnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.minnum.v32f64(<32 x double>, <32 x double>)
-
-declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.maxnum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.maxnum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.maxnum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.maxnum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.maxnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.maxnum.v32f64(<32 x double>, <32 x double>)
-
-declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.minimum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.minimum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.minimum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.minimum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.minimum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.minimum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.minimum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.minimum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.minimum.v32f64(<32 x double>, <32 x double>)
-
-declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
-declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
-declare <64 x half> @llvm.maximum.v64f16(<64 x half>, <64 x half>)
-declare <128 x half> @llvm.maximum.v128f16(<128 x half>, <128 x half>)
-declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
-declare <32 x float> @llvm.maximum.v32f32(<32 x float>, <32 x float>)
-declare <64 x float> @llvm.maximum.v64f32(<64 x float>, <64 x float>)
-declare <1 x double> @llvm.maximum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.maximum.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.maximum.v16f64(<16 x double>, <16 x double>)
-declare <32 x double> @llvm.maximum.v32f64(<32 x double>, <32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
index 9ac6fc9..6fedf63 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
@@ -1881,137 +1881,272 @@
   ret void
 }
 
+;
+; FCANONICALIZE -> FMINNM
+;
+
+; Don't use SVE for 64-bit vectors.
+define <4 x half> @fcanonicalize_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %res = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %op) ; canonicalize is expected as a NEON fminnm of the value with itself
+  ret <4 x half> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <8 x half> @fcanonicalize_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %op) ; canonicalize is expected as a NEON fminnm of the value with itself
+  ret <8 x half> %res
+}
+
+define void @fcanonicalize_v16f16(ptr %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <16 x half>, ptr %a
+  %res = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %op) ; first f16 size using predicated z registers: canonicalize is a self-fminnm under the vl16 predicate
+  store <16 x half> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v32f16(ptr %a) #0 {
+; VBITS_GE_256-LABEL: fcanonicalize_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_256-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcanonicalize_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op = load <32 x half>, ptr %a
+  %res = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %op) ; no vscale_range on this function: the 256-bit run canonicalizes two vl16 halves separately, the wider runs use one vl32 self-fminnm
+  store <32 x half> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v64f16(ptr %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcanonicalize_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <64 x half>, ptr %a
+  %res = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %op) ; canonicalize is a single self-fminnm under the vl64 predicate
+  store <64 x half> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v128f16(ptr %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcanonicalize_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <128 x half>, ptr %a
+  %res = call <128 x half> @llvm.canonicalize.v128f16(<128 x half> %op) ; canonicalize is a single self-fminnm under the vl128 predicate
+  store <128 x half> %res, ptr %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @fcanonicalize_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %op) ; canonicalize is expected as a NEON fminnm of the value with itself
+  ret <2 x float> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @fcanonicalize_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %op) ; canonicalize is expected as a NEON fminnm of the value with itself
+  ret <4 x float> %res
+}
+
+define void @fcanonicalize_v8f32(ptr %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <8 x float>, ptr %a
+  %res = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %op) ; first f32 size using predicated z registers: canonicalize is a self-fminnm under the vl8 predicate
+  store <8 x float> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v16f32(ptr %a) #0 {
+; VBITS_GE_256-LABEL: fcanonicalize_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_256-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcanonicalize_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op = load <16 x float>, ptr %a
+  %res = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> %op) ; no vscale_range on this function: the 256-bit run canonicalizes two vl8 halves separately, the wider runs use one vl16 self-fminnm
+  store <16 x float> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v32f32(ptr %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcanonicalize_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <32 x float>, ptr %a
+  %res = call <32 x float> @llvm.canonicalize.v32f32(<32 x float> %op) ; canonicalize is a single self-fminnm under the vl32 predicate
+  store <32 x float> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v64f32(ptr %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcanonicalize_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <64 x float>, ptr %a
+  %res = call <64 x float> @llvm.canonicalize.v64f32(<64 x float> %op) ; canonicalize is a single self-fminnm under the vl64 predicate
+  store <64 x float> %res, ptr %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define <1 x double> @fcanonicalize_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm d0, d0, d0
+; CHECK-NEXT:    ret
+  %res = call <1 x double> @llvm.canonicalize.v1f64(<1 x double> %op) ; canonicalize is expected as an fminnm of d0 with itself (no vector arrangement, no predicate)
+  ret <1 x double> %res
+}
+
+; Don't use SVE for 128-bit vectors.
+define <2 x double> @fcanonicalize_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.2d, v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %op) ; canonicalize is expected as a NEON fminnm of the value with itself
+  ret <2 x double> %res
+}
+
+define void @fcanonicalize_v4f64(ptr %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcanonicalize_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <4 x double>, ptr %a
+  %res = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %op) ; first f64 size using predicated z registers: canonicalize is a self-fminnm under the vl4 predicate
+  store <4 x double> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v8f64(ptr %a) #0 {
+; VBITS_GE_256-LABEL: fcanonicalize_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_256-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcanonicalize_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op = load <8 x double>, ptr %a
+  %res = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> %op) ; no vscale_range on this function: the 256-bit run canonicalizes two vl4 halves separately, the wider runs use one vl8 self-fminnm
+  store <8 x double> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v16f64(ptr %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcanonicalize_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <16 x double>, ptr %a
+  %res = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> %op) ; canonicalize is a single self-fminnm under the vl16 predicate
+  store <16 x double> %res, ptr %a
+  ret void
+}
+
+define void @fcanonicalize_v32f64(ptr %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcanonicalize_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op = load <32 x double>, ptr %a
+  %res = call <32 x double> @llvm.canonicalize.v32f64(<32 x double> %op) ; canonicalize is a single self-fminnm under the vl32 predicate
+  store <32 x double> %res, ptr %a
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" }
-
-declare <4 x half> @llvm.ceil.v4f16(<4 x half>)
-declare <8 x half> @llvm.ceil.v8f16(<8 x half>)
-declare <16 x half> @llvm.ceil.v16f16(<16 x half>)
-declare <32 x half> @llvm.ceil.v32f16(<32 x half>)
-declare <64 x half> @llvm.ceil.v64f16(<64 x half>)
-declare <128 x half> @llvm.ceil.v128f16(<128 x half>)
-declare <2 x float> @llvm.ceil.v2f32(<2 x float>)
-declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
-declare <8 x float> @llvm.ceil.v8f32(<8 x float>)
-declare <16 x float> @llvm.ceil.v16f32(<16 x float>)
-declare <32 x float> @llvm.ceil.v32f32(<32 x float>)
-declare <64 x float> @llvm.ceil.v64f32(<64 x float>)
-declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
-declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
-declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
-declare <8 x double> @llvm.ceil.v8f64(<8 x double>)
-declare <16 x double> @llvm.ceil.v16f64(<16 x double>)
-declare <32 x double> @llvm.ceil.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.floor.v4f16(<4 x half>)
-declare <8 x half> @llvm.floor.v8f16(<8 x half>)
-declare <16 x half> @llvm.floor.v16f16(<16 x half>)
-declare <32 x half> @llvm.floor.v32f16(<32 x half>)
-declare <64 x half> @llvm.floor.v64f16(<64 x half>)
-declare <128 x half> @llvm.floor.v128f16(<128 x half>)
-declare <2 x float> @llvm.floor.v2f32(<2 x float>)
-declare <4 x float> @llvm.floor.v4f32(<4 x float>)
-declare <8 x float> @llvm.floor.v8f32(<8 x float>)
-declare <16 x float> @llvm.floor.v16f32(<16 x float>)
-declare <32 x float> @llvm.floor.v32f32(<32 x float>)
-declare <64 x float> @llvm.floor.v64f32(<64 x float>)
-declare <1 x double> @llvm.floor.v1f64(<1 x double>)
-declare <2 x double> @llvm.floor.v2f64(<2 x double>)
-declare <4 x double> @llvm.floor.v4f64(<4 x double>)
-declare <8 x double> @llvm.floor.v8f64(<8 x double>)
-declare <16 x double> @llvm.floor.v16f64(<16 x double>)
-declare <32 x double> @llvm.floor.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
-declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
-declare <16 x half> @llvm.nearbyint.v16f16(<16 x half>)
-declare <32 x half> @llvm.nearbyint.v32f16(<32 x half>)
-declare <64 x half> @llvm.nearbyint.v64f16(<64 x half>)
-declare <128 x half> @llvm.nearbyint.v128f16(<128 x half>)
-declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>)
-declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
-declare <8 x float> @llvm.nearbyint.v8f32(<8 x float>)
-declare <16 x float> @llvm.nearbyint.v16f32(<16 x float>)
-declare <32 x float> @llvm.nearbyint.v32f32(<32 x float>)
-declare <64 x float> @llvm.nearbyint.v64f32(<64 x float>)
-declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
-declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
-declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
-declare <8 x double> @llvm.nearbyint.v8f64(<8 x double>)
-declare <16 x double> @llvm.nearbyint.v16f64(<16 x double>)
-declare <32 x double> @llvm.nearbyint.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.rint.v4f16(<4 x half>)
-declare <8 x half> @llvm.rint.v8f16(<8 x half>)
-declare <16 x half> @llvm.rint.v16f16(<16 x half>)
-declare <32 x half> @llvm.rint.v32f16(<32 x half>)
-declare <64 x half> @llvm.rint.v64f16(<64 x half>)
-declare <128 x half> @llvm.rint.v128f16(<128 x half>)
-declare <2 x float> @llvm.rint.v2f32(<2 x float>)
-declare <4 x float> @llvm.rint.v4f32(<4 x float>)
-declare <8 x float> @llvm.rint.v8f32(<8 x float>)
-declare <16 x float> @llvm.rint.v16f32(<16 x float>)
-declare <32 x float> @llvm.rint.v32f32(<32 x float>)
-declare <64 x float> @llvm.rint.v64f32(<64 x float>)
-declare <1 x double> @llvm.rint.v1f64(<1 x double>)
-declare <2 x double> @llvm.rint.v2f64(<2 x double>)
-declare <4 x double> @llvm.rint.v4f64(<4 x double>)
-declare <8 x double> @llvm.rint.v8f64(<8 x double>)
-declare <16 x double> @llvm.rint.v16f64(<16 x double>)
-declare <32 x double> @llvm.rint.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.round.v4f16(<4 x half>)
-declare <8 x half> @llvm.round.v8f16(<8 x half>)
-declare <16 x half> @llvm.round.v16f16(<16 x half>)
-declare <32 x half> @llvm.round.v32f16(<32 x half>)
-declare <64 x half> @llvm.round.v64f16(<64 x half>)
-declare <128 x half> @llvm.round.v128f16(<128 x half>)
-declare <2 x float> @llvm.round.v2f32(<2 x float>)
-declare <4 x float> @llvm.round.v4f32(<4 x float>)
-declare <8 x float> @llvm.round.v8f32(<8 x float>)
-declare <16 x float> @llvm.round.v16f32(<16 x float>)
-declare <32 x float> @llvm.round.v32f32(<32 x float>)
-declare <64 x float> @llvm.round.v64f32(<64 x float>)
-declare <1 x double> @llvm.round.v1f64(<1 x double>)
-declare <2 x double> @llvm.round.v2f64(<2 x double>)
-declare <4 x double> @llvm.round.v4f64(<4 x double>)
-declare <8 x double> @llvm.round.v8f64(<8 x double>)
-declare <16 x double> @llvm.round.v16f64(<16 x double>)
-declare <32 x double> @llvm.round.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.roundeven.v4f16(<4 x half>)
-declare <8 x half> @llvm.roundeven.v8f16(<8 x half>)
-declare <16 x half> @llvm.roundeven.v16f16(<16 x half>)
-declare <32 x half> @llvm.roundeven.v32f16(<32 x half>)
-declare <64 x half> @llvm.roundeven.v64f16(<64 x half>)
-declare <128 x half> @llvm.roundeven.v128f16(<128 x half>)
-declare <2 x float> @llvm.roundeven.v2f32(<2 x float>)
-declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
-declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
-declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
-declare <32 x float> @llvm.roundeven.v32f32(<32 x float>)
-declare <64 x float> @llvm.roundeven.v64f32(<64 x float>)
-declare <1 x double> @llvm.roundeven.v1f64(<1 x double>)
-declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
-declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
-declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
-declare <16 x double> @llvm.roundeven.v16f64(<16 x double>)
-declare <32 x double> @llvm.roundeven.v32f64(<32 x double>)
-
-declare <4 x half> @llvm.trunc.v4f16(<4 x half>)
-declare <8 x half> @llvm.trunc.v8f16(<8 x half>)
-declare <16 x half> @llvm.trunc.v16f16(<16 x half>)
-declare <32 x half> @llvm.trunc.v32f16(<32 x half>)
-declare <64 x half> @llvm.trunc.v64f16(<64 x half>)
-declare <128 x half> @llvm.trunc.v128f16(<128 x half>)
-declare <2 x float> @llvm.trunc.v2f32(<2 x float>)
-declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
-declare <8 x float> @llvm.trunc.v8f32(<8 x float>)
-declare <16 x float> @llvm.trunc.v16f32(<16 x float>)
-declare <32 x float> @llvm.trunc.v32f32(<32 x float>)
-declare <64 x float> @llvm.trunc.v64f32(<64 x float>)
-declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
-declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
-declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
-declare <8 x double> @llvm.trunc.v8f64(<8 x double>)
-declare <16 x double> @llvm.trunc.v16f64(<16 x double>)
-declare <32 x double> @llvm.trunc.v32f64(<32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll
index 2f3f99c..5cf8f82 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
+
+target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 8 x half> @fadd_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: fadd_nxv8f16:
@@ -495,13 +498,20 @@
 %complex = type { { double, double } }
 
 define void @scalar_to_vector(ptr %outval, <vscale x 2 x i1> %pred, <vscale x 2 x double> %in1, <vscale x 2 x double> %in2) {
-; CHECK-LABEL: scalar_to_vector:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    faddv d0, p0, z0.d
-; CHECK-NEXT:    faddv d1, p0, z1.d
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; SVE-LABEL: scalar_to_vector:
+; SVE:       // %bb.0:
+; SVE-NEXT:    faddv d0, p0, z0.d
+; SVE-NEXT:    faddv d1, p0, z1.d
+; SVE-NEXT:    mov v0.d[1], v1.d[0]
+; SVE-NEXT:    str q0, [x0]
+; SVE-NEXT:    ret
+;
+; SME-LABEL: scalar_to_vector:
+; SME:       // %bb.0:
+; SME-NEXT:    faddv d0, p0, z0.d
+; SME-NEXT:    faddv d1, p0, z1.d
+; SME-NEXT:    stp d0, d1, [x0]
+; SME-NEXT:    ret
   %imagp = getelementptr inbounds %complex, ptr %outval, i64 0, i32 0, i32 1
   %1 = call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %in1)
   %2 = call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %in2)
@@ -1089,72 +1099,206 @@
   ret <vscale x 2 x double> %res
 }
 
-declare <vscale x 8 x half> @llvm.aarch64.sve.frecps.x.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float>  @llvm.aarch64.sve.frecps.x.nxv4f32(<vscale x 4 x float> , <vscale x 4 x float>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.frecps.x.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 8 x half> @canonicalize_nxv8f16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: canonicalize_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    ret
+  %r = call <vscale x 8 x half> @llvm.canonicalize.nxv8f16(<vscale x 8 x half> %a)
+  ret <vscale x 8 x half> %r
+}
 
-declare <vscale x 8 x half> @llvm.aarch64.sve.frsqrts.x.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.frsqrts.x.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.frsqrts.x.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 4 x half> @canonicalize_nxv4f16(<vscale x 4 x half> %a) {
+; CHECK-LABEL: canonicalize_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    ret
+  %r = call <vscale x 4 x half> @llvm.canonicalize.nxv4f16(<vscale x 4 x half> %a)
+  ret <vscale x 4 x half> %r
+}
 
-declare <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>)
+define <vscale x 2 x half> @canonicalize_nxv2f16(<vscale x 2 x half> %a) {
+; CHECK-LABEL: canonicalize_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    ret
+  %r = call <vscale x 2 x half> @llvm.canonicalize.nxv2f16(<vscale x 2 x half> %a)
+  ret <vscale x 2 x half> %r
+}
 
-declare <vscale x 8 x half> @llvm.sqrt.nxv8f16( <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.sqrt.nxv4f16( <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.sqrt.nxv2f16( <vscale x 2 x half>)
-declare <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>)
+define <vscale x 4 x float> @canonicalize_nxv4f32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: canonicalize_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    ret
+  %r = call <vscale x 4 x float> @llvm.canonicalize.nxv4f32(<vscale x 4 x float> %a)
+  ret <vscale x 4 x float> %r
+}
 
-declare <vscale x 8 x half> @llvm.fabs.nxv8f16( <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.fabs.nxv4f16( <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.fabs.nxv2f16( <vscale x 2 x half>)
-declare <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.fabs.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double>)
+define <vscale x 2 x float> @canonicalize_nxv2f32(<vscale x 2 x float> %a) {
+; CHECK-LABEL: canonicalize_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    ret
+  %r = call <vscale x 2 x float> @llvm.canonicalize.nxv2f32(<vscale x 2 x float> %a)
+  ret <vscale x 2 x float> %r
+}
 
-declare <vscale x 16 x half> @llvm.maxnum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.maxnum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.maxnum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.maxnum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.maxnum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.maxnum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.maxnum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 16 x half> @llvm.minnum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.minnum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.minnum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.minnum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.minnum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.minnum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.minnum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @canonicalize_nxv2f64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: canonicalize_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    ret
+  %r = call <vscale x 2 x double> @llvm.canonicalize.nxv2f64(<vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %r
+}
 
-declare <vscale x 16 x half> @llvm.maximum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.maximum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.maximum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.maximum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.maximum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.maximum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.maximum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.maximum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 16 x half> @llvm.minimum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>)
-declare <vscale x 8 x half> @llvm.minimum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x half> @llvm.minimum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 2 x half> @llvm.minimum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x float> @llvm.minimum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 4 x float> @llvm.minimum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x float> @llvm.minimum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x double> @llvm.minimum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 8 x half> @maximumnum_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: maximumnum_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.maximumnum.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %res
+}
 
-; Function Attrs: nounwind readnone
-declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>) #2
+define <vscale x 4 x half> @maximumnum_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; CHECK-LABEL: maximumnum_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x half> @llvm.maximumnum.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+  ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @maximumnum_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: maximumnum_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x half> @llvm.maximumnum.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+  ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @maximumnum_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: maximumnum_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @maximumnum_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: maximumnum_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x float> @llvm.maximumnum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+  ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @maximumnum_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: maximumnum_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @minimumnum_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: minimumnum_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @minimumnum_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; CHECK-LABEL: minimumnum_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x half> @llvm.minimumnum.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+  ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @minimumnum_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: minimumnum_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x half> @llvm.minimumnum.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+  ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @minimumnum_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: minimumnum_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @minimumnum_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: minimumnum_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x float> @llvm.minimumnum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+  ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @minimumnum_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: minimumnum_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %res
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 369b698..da806dd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -1841,42 +1841,1072 @@
   ret void
 }
 
-declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.minnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
+;
+; FMAXIMUMNUM
+;
 
-declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.maxnum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
+define <4 x half> @fmaximumnum_v4f16(<4 x half> %op1, <4 x half> %op2) {
+; CHECK-LABEL: fmaximumnum_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+  ret <4 x half> %res
+}
 
-declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.minimum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
+define <8 x half> @fmaximumnum_v8f16(<8 x half> %op1, <8 x half> %op2) {
+; CHECK-LABEL: fmaximumnum_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+  ret <8 x half> %res
+}
 
-declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
-declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
-declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
-declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>)
-declare <1 x double> @llvm.maximum.v1f64(<1 x double>, <1 x double>)
-declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
+define void @fmaximumnum_v16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: fmaximumnum_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z3.h, p0/m, z3.h, z3.h
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z2.h, p0/m, z2.h, z2.h
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fmaxnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %res = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+  store <16 x half> %res, ptr %a
+  ret void
+}
+
+define <2 x float> @fmaximumnum_v2f32(<2 x float> %op1, <2 x float> %op2) {
+; CHECK-LABEL: fmaximumnum_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+  ret <2 x float> %res
+}
+
+define <4 x float> @fmaximumnum_v4f32(<4 x float> %op1, <4 x float> %op2) {
+; CHECK-LABEL: fmaximumnum_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+  ret <4 x float> %res
+}
+
+define void @fmaximumnum_v8f32(ptr %a, ptr %b) {
+; CHECK-LABEL: fmaximumnum_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z3.s, p0/m, z3.s, z3.s
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z2.s, p0/m, z2.s, z2.s
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmaxnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <8 x float>, ptr %a
+  %op2 = load <8 x float>, ptr %b
+  %res = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+  store <8 x float> %res, ptr %a
+  ret void
+}
+
+define <1 x double> @fmaximumnum_v1f64(<1 x double> %op1, <1 x double> %op2) {
+; CHECK-LABEL: fmaximumnum_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm d1, d1, d1
+; CHECK-NEXT:    fminnm d0, d0, d0
+; CHECK-NEXT:    fmaxnm d0, d0, d1
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <1 x double> @llvm.maximumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+  ret <1 x double> %res
+}
+
+define <2 x double> @fmaximumnum_v2f64(<2 x double> %op1, <2 x double> %op2) {
+; CHECK-LABEL: fmaximumnum_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+  ret <2 x double> %res
+}
+
+define void @fmaximumnum_v4f64(ptr %a, ptr %b) {
+; CHECK-LABEL: fmaximumnum_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z3.d, p0/m, z3.d, z3.d
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z2.d, p0/m, z2.d, z2.d
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fmaxnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumnum_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fmaxnm d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <4 x double>, ptr %a
+  %op2 = load <4 x double>, ptr %b
+  %res = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> %op1, <4 x double> %op2)
+  store <4 x double> %res, ptr %a
+  ret void
+}
+
+;
+; FMINIMUMNUM
+;
+
+define <4 x half> @fminimumnum_v4f16(<4 x half> %op1, <4 x half> %op2) {
+; CHECK-LABEL: fminimumnum_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %op1, <4 x half> %op2)
+  ret <4 x half> %res
+}
+
+define <8 x half> @fminimumnum_v8f16(<8 x half> %op1, <8 x half> %op2) {
+; CHECK-LABEL: fminimumnum_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %op1, <8 x half> %op2)
+  ret <8 x half> %res
+}
+
+define void @fminimumnum_v16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: fminimumnum_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z3.h, p0/m, z3.h, z3.h
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    fminnm z2.h, p0/m, z2.h, z2.h
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fminnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %res = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %op1, <16 x half> %op2)
+  store <16 x half> %res, ptr %a
+  ret void
+}
+
+define <2 x float> @fminimumnum_v2f32(<2 x float> %op1, <2 x float> %op2) {
+; CHECK-LABEL: fminimumnum_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %op1, <2 x float> %op2)
+  ret <2 x float> %res
+}
+
+define <4 x float> @fminimumnum_v4f32(<4 x float> %op1, <4 x float> %op2) {
+; CHECK-LABEL: fminimumnum_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %op1, <4 x float> %op2)
+  ret <4 x float> %res
+}
+
+define void @fminimumnum_v8f32(ptr %a, ptr %b) {
+; CHECK-LABEL: fminimumnum_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z3.s, p0/m, z3.s, z3.s
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    fminnm z2.s, p0/m, z2.s, z2.s
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fminnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <8 x float>, ptr %a
+  %op2 = load <8 x float>, ptr %b
+  %res = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> %op1, <8 x float> %op2)
+  store <8 x float> %res, ptr %a
+  ret void
+}
+
+define <1 x double> @fminimumnum_v1f64(<1 x double> %op1, <1 x double> %op2) {
+; CHECK-LABEL: fminimumnum_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm d1, d1, d1
+; CHECK-NEXT:    fminnm d0, d0, d0
+; CHECK-NEXT:    fminnm d0, d0, d1
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <1 x double> @llvm.minimumnum.v1f64(<1 x double> %op1, <1 x double> %op2)
+  ret <1 x double> %res
+}
+
+define <2 x double> @fminimumnum_v2f64(<2 x double> %op1, <2 x double> %op2) {
+; CHECK-LABEL: fminimumnum_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fminnm d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %op1, <2 x double> %op2)
+  ret <2 x double> %res
+}
+
+define void @fminimumnum_v4f64(ptr %a, ptr %b) {
+; CHECK-LABEL: fminimumnum_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z3.d, p0/m, z3.d, z3.d
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    fminnm z2.d, p0/m, z2.d, z2.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fminnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumnum_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fminnm d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d1, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <4 x double>, ptr %a
+  %op2 = load <4 x double>, ptr %b
+  %res = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> %op1, <4 x double> %op2)
+  store <4 x double> %res, ptr %a
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index f278423..765ad87 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -2903,142 +2903,416 @@
   ret void
 }
 
-declare <2 x half> @llvm.ceil.v2f16(<2 x half>)
-declare <4 x half> @llvm.ceil.v4f16(<4 x half>)
-declare <8 x half> @llvm.ceil.v8f16(<8 x half>)
-declare <16 x half> @llvm.ceil.v16f16(<16 x half>)
-declare <32 x half> @llvm.ceil.v32f16(<32 x half>)
-declare <64 x half> @llvm.ceil.v64f16(<64 x half>)
-declare <128 x half> @llvm.ceil.v128f16(<128 x half>)
-declare <2 x float> @llvm.ceil.v2f32(<2 x float>)
-declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
-declare <8 x float> @llvm.ceil.v8f32(<8 x float>)
-declare <16 x float> @llvm.ceil.v16f32(<16 x float>)
-declare <32 x float> @llvm.ceil.v32f32(<32 x float>)
-declare <64 x float> @llvm.ceil.v64f32(<64 x float>)
-declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
-declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
-declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
-declare <8 x double> @llvm.ceil.v8f64(<8 x double>)
-declare <16 x double> @llvm.ceil.v16f64(<16 x double>)
-declare <32 x double> @llvm.ceil.v32f64(<32 x double>)
+;
+; FCANONICALIZE -> FMINNM
+;
 
-declare <2 x half> @llvm.floor.v2f16(<2 x half>)
-declare <4 x half> @llvm.floor.v4f16(<4 x half>)
-declare <8 x half> @llvm.floor.v8f16(<8 x half>)
-declare <16 x half> @llvm.floor.v16f16(<16 x half>)
-declare <32 x half> @llvm.floor.v32f16(<32 x half>)
-declare <64 x half> @llvm.floor.v64f16(<64 x half>)
-declare <128 x half> @llvm.floor.v128f16(<128 x half>)
-declare <2 x float> @llvm.floor.v2f32(<2 x float>)
-declare <4 x float> @llvm.floor.v4f32(<4 x float>)
-declare <8 x float> @llvm.floor.v8f32(<8 x float>)
-declare <16 x float> @llvm.floor.v16f32(<16 x float>)
-declare <32 x float> @llvm.floor.v32f32(<32 x float>)
-declare <64 x float> @llvm.floor.v64f32(<64 x float>)
-declare <1 x double> @llvm.floor.v1f64(<1 x double>)
-declare <2 x double> @llvm.floor.v2f64(<2 x double>)
-declare <4 x double> @llvm.floor.v4f64(<4 x double>)
-declare <8 x double> @llvm.floor.v8f64(<8 x double>)
-declare <16 x double> @llvm.floor.v16f64(<16 x double>)
-declare <32 x double> @llvm.floor.v32f64(<32 x double>)
+define <2 x half> @fcanonicalize_v2f16(<2 x half> %op) {
+; CHECK-LABEL: fcanonicalize_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %op)
+  ret <2 x half> %res
+}
 
-declare <2 x half> @llvm.nearbyint.v2f16(<2 x half>)
-declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
-declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
-declare <16 x half> @llvm.nearbyint.v16f16(<16 x half>)
-declare <32 x half> @llvm.nearbyint.v32f16(<32 x half>)
-declare <64 x half> @llvm.nearbyint.v64f16(<64 x half>)
-declare <128 x half> @llvm.nearbyint.v128f16(<128 x half>)
-declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>)
-declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
-declare <8 x float> @llvm.nearbyint.v8f32(<8 x float>)
-declare <16 x float> @llvm.nearbyint.v16f32(<16 x float>)
-declare <32 x float> @llvm.nearbyint.v32f32(<32 x float>)
-declare <64 x float> @llvm.nearbyint.v64f32(<64 x float>)
-declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
-declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
-declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
-declare <8 x double> @llvm.nearbyint.v8f64(<8 x double>)
-declare <16 x double> @llvm.nearbyint.v16f64(<16 x double>)
-declare <32 x double> @llvm.nearbyint.v32f64(<32 x double>)
+define <4 x half> @fcanonicalize_v4f16(<4 x half> %op) {
+; CHECK-LABEL: fcanonicalize_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %op)
+  ret <4 x half> %res
+}
 
-declare <2 x half> @llvm.rint.v2f16(<2 x half>)
-declare <4 x half> @llvm.rint.v4f16(<4 x half>)
-declare <8 x half> @llvm.rint.v8f16(<8 x half>)
-declare <16 x half> @llvm.rint.v16f16(<16 x half>)
-declare <32 x half> @llvm.rint.v32f16(<32 x half>)
-declare <64 x half> @llvm.rint.v64f16(<64 x half>)
-declare <128 x half> @llvm.rint.v128f16(<128 x half>)
-declare <2 x float> @llvm.rint.v2f32(<2 x float>)
-declare <4 x float> @llvm.rint.v4f32(<4 x float>)
-declare <8 x float> @llvm.rint.v8f32(<8 x float>)
-declare <16 x float> @llvm.rint.v16f32(<16 x float>)
-declare <32 x float> @llvm.rint.v32f32(<32 x float>)
-declare <64 x float> @llvm.rint.v64f32(<64 x float>)
-declare <1 x double> @llvm.rint.v1f64(<1 x double>)
-declare <2 x double> @llvm.rint.v2f64(<2 x double>)
-declare <4 x double> @llvm.rint.v4f64(<4 x double>)
-declare <8 x double> @llvm.rint.v8f64(<8 x double>)
-declare <16 x double> @llvm.rint.v16f64(<16 x double>)
-declare <32 x double> @llvm.rint.v32f64(<32 x double>)
+define <8 x half> @fcanonicalize_v8f16(<8 x half> %op) {
+; CHECK-LABEL: fcanonicalize_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %op)
+  ret <8 x half> %res
+}
 
-declare <2 x half> @llvm.round.v2f16(<2 x half>)
-declare <4 x half> @llvm.round.v4f16(<4 x half>)
-declare <8 x half> @llvm.round.v8f16(<8 x half>)
-declare <16 x half> @llvm.round.v16f16(<16 x half>)
-declare <32 x half> @llvm.round.v32f16(<32 x half>)
-declare <64 x half> @llvm.round.v64f16(<64 x half>)
-declare <128 x half> @llvm.round.v128f16(<128 x half>)
-declare <2 x float> @llvm.round.v2f32(<2 x float>)
-declare <4 x float> @llvm.round.v4f32(<4 x float>)
-declare <8 x float> @llvm.round.v8f32(<8 x float>)
-declare <16 x float> @llvm.round.v16f32(<16 x float>)
-declare <32 x float> @llvm.round.v32f32(<32 x float>)
-declare <64 x float> @llvm.round.v64f32(<64 x float>)
-declare <1 x double> @llvm.round.v1f64(<1 x double>)
-declare <2 x double> @llvm.round.v2f64(<2 x double>)
-declare <4 x double> @llvm.round.v4f64(<4 x double>)
-declare <8 x double> @llvm.round.v8f64(<8 x double>)
-declare <16 x double> @llvm.round.v16f64(<16 x double>)
-declare <32 x double> @llvm.round.v32f64(<32 x double>)
+define void @fcanonicalize_v16f16(ptr %a) {
+; CHECK-LABEL: fcanonicalize_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z0.h
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z1.h
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
+  %op = load <16 x half>, ptr %a
+  %res = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %op)
+  store <16 x half> %res, ptr %a
+  ret void
+}
 
-declare <2 x half> @llvm.roundeven.v2f16(<2 x half>)
-declare <4 x half> @llvm.roundeven.v4f16(<4 x half>)
-declare <8 x half> @llvm.roundeven.v8f16(<8 x half>)
-declare <16 x half> @llvm.roundeven.v16f16(<16 x half>)
-declare <32 x half> @llvm.roundeven.v32f16(<32 x half>)
-declare <64 x half> @llvm.roundeven.v64f16(<64 x half>)
-declare <128 x half> @llvm.roundeven.v128f16(<128 x half>)
-declare <2 x float> @llvm.roundeven.v2f32(<2 x float>)
-declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
-declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
-declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
-declare <32 x float> @llvm.roundeven.v32f32(<32 x float>)
-declare <64 x float> @llvm.roundeven.v64f32(<64 x float>)
-declare <1 x double> @llvm.roundeven.v1f64(<1 x double>)
-declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
-declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
-declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
-declare <16 x double> @llvm.roundeven.v16f64(<16 x double>)
-declare <32 x double> @llvm.roundeven.v32f64(<32 x double>)
+define <2 x float> @fcanonicalize_v2f32(<2 x float> %op) {
+; CHECK-LABEL: fcanonicalize_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %op)
+  ret <2 x float> %res
+}
 
-declare <2 x half> @llvm.trunc.v2f16(<2 x half>)
-declare <4 x half> @llvm.trunc.v4f16(<4 x half>)
-declare <8 x half> @llvm.trunc.v8f16(<8 x half>)
-declare <16 x half> @llvm.trunc.v16f16(<16 x half>)
-declare <32 x half> @llvm.trunc.v32f16(<32 x half>)
-declare <64 x half> @llvm.trunc.v64f16(<64 x half>)
-declare <128 x half> @llvm.trunc.v128f16(<128 x half>)
-declare <2 x float> @llvm.trunc.v2f32(<2 x float>)
-declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
-declare <8 x float> @llvm.trunc.v8f32(<8 x float>)
-declare <16 x float> @llvm.trunc.v16f32(<16 x float>)
-declare <32 x float> @llvm.trunc.v32f32(<32 x float>)
-declare <64 x float> @llvm.trunc.v64f32(<64 x float>)
-declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
-declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
-declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
-declare <8 x double> @llvm.trunc.v8f64(<8 x double>)
-declare <16 x double> @llvm.trunc.v16f64(<16 x double>)
-declare <32 x double> @llvm.trunc.v32f64(<32 x double>)
+define <4 x float> @fcanonicalize_v4f32(<4 x float> %op) {
+; CHECK-LABEL: fcanonicalize_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %op)
+  ret <4 x float> %res
+}
+
+define void @fcanonicalize_v8f32(ptr %a) {
+; CHECK-LABEL: fcanonicalize_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z1.s
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fminnm s1, s0, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
+  %op = load <8 x float>, ptr %a
+  %res = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %op)
+  store <8 x float> %res, ptr %a
+  ret void
+}
+
+define <1 x double> @fcanonicalize_v1f64(<1 x double> %op) {
+; CHECK-LABEL: fcanonicalize_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm d0, d0, d0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <1 x double> @llvm.canonicalize.v1f64(<1 x double> %op)
+  ret <1 x double> %res
+}
+
+define <2 x double> @fcanonicalize_v2f64(<2 x double> %op) {
+; CHECK-LABEL: fcanonicalize_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm d1, d0, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %res = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %op)
+  ret <2 x double> %res
+}
+
+define void @fcanonicalize_v4f64(ptr %a) {
+; CHECK-LABEL: fcanonicalize_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z1.d
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcanonicalize_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d1, d0, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm d1, d0, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
+  %op = load <4 x double>, ptr %a
+  %res = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %op)
+  store <4 x double> %res, ptr %a
+  ret void
+}