[NVPTX] Scalarize v2f32 instructions if input operand guarantees need for register coalescing (#180113)

The support of f32 packed instructions in #126337 revealed performance
regressions on certain kernels. In one case, the cause comes from
loading a v4f32 from shared memory but then accessing them as {r0, r2}
and {r1, r3} from the full load of {r0, r1, r2, r3}.

This access pattern guarantees that the registers require a coalescing
operation, which increases register pressure and degrades performance.
The fix here is to identify whether we can prove that a v2f32 operand
comes from non-contiguous vector extracts and, if so, scalarize the
operation so that the coalescing operation is no longer needed.

I've found that ptxas can see through the extra unpacks/repacks of
contiguous registers this causes in MIR. However, in the full test case,
the packing of the final scalar->vector results does generate additional
costs, especially since the only users unpack them. An additional MIR
pass could be added later to catch that case.

Assisted-by: Cursor / claude-4.6-opus-high

---------

Co-authored-by: Princeton Ferro <princetonferro@gmail.com>
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8f1b705..f5554be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -891,6 +891,10 @@
                        ISD::SIGN_EXTEND,
                        ISD::INTRINSIC_WO_CHAIN});
 
+  // If the vector operands require register coalescing, scalarize instead
+  if (STI.hasF32x2Instructions())
+    setTargetDAGCombine({ISD::FMA, ISD::FMUL, ISD::FSUB});
+
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
   if (STI.allowFP16Math() || STI.hasBF16Math())
@@ -6135,14 +6139,126 @@
   return PerformADDCombineWithOperands(N, N1, N0, DCI);
 }
 
+/// Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent
+/// register pairs (non-coalescable).
+static bool isNonCoalescableBuildVector(const SDValue &BV) {
+  if (BV.getOpcode() != ISD::BUILD_VECTOR || BV.getValueType() != MVT::v2f32)
+    return false;
+
+  SDValue Elt0 = BV.getOperand(0);
+  SDValue Elt1 = BV.getOperand(1);
+
+  bool IsExt0 = Elt0.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+  bool IsExt1 = Elt1.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+
+  // If neither element is an EXTRACT_VECTOR_ELT they are free-standing
+  // scalars and the register allocator can still place them side-by-side.
+  if (!IsExt0 && !IsExt1)
+    return false;
+
+  // If exactly one element is an EXTRACT_VECTOR_ELT, the other is a scalar
+  // that cannot generally occupy the adjacent register slot.
+  if (IsExt0 != IsExt1)
+    return true;
+
+  // At this point both sources are extracting from vectors. If they are from
+  // different vectors, then the BUILD_VECTOR is non-coalescable.
+  SDValue Src0 = Elt0.getOperand(0);
+  SDValue Src1 = Elt1.getOperand(0);
+  if (Src0 != Src1)
+    return true;
+
+  auto *Idx0 = dyn_cast<ConstantSDNode>(Elt0.getOperand(1));
+  auto *Idx1 = dyn_cast<ConstantSDNode>(Elt1.getOperand(1));
+  // If both indices are dynamic they will be lowered to
+  // loads and the vector will be spilled to local memory. The register
+  // allocator can easily place the results in adjacent registers.
+  if (!Idx0 && !Idx1)
+    return false;
+
+  // If one index is dynamic and the other is constant, the value from the
+  // constant load will result in an additional register to pair with the result
+  // from the dynamic load. We consider this non-coalescable.
+  if ((Idx0 && !Idx1) || (!Idx0 && Idx1))
+    return true;
+
+  // Both are constant, adjacent pairs are coalescable
+  return std::abs(Idx0->getSExtValue() - Idx1->getSExtValue()) != 1;
+}
+
+/// Scalarize a v2f32 arithmetic node (FADD, FMUL, FSUB, FMA) when at least
+/// one operand is a BUILD_VECTOR that repacks values from non-adjacent register
+/// pairs.  Without this combine the BUILD_VECTOR forces allocation of a
+/// temporary 64-bit register, increasing register pressure.
+///
+/// Example - before:
+///   t0: v2f32,v2f32,ch = LoadV2 ...
+///   t1: f32 = extract_vector_elt t0, 0
+///   t2: f32 = extract_vector_elt t0:1, 0
+///   t3: v2f32 = BUILD_VECTOR t1, t2       ;; non-coalescable repack
+///   t4: v2f32 = fma t_a, t3, t_c
+///
+/// After:
+///   t0: v2f32,v2f32,ch = LoadV2 ...
+///   t1: f32 = extract_vector_elt t0, 0
+///   t2: f32 = extract_vector_elt t0:1, 0
+///   a0: f32 = extract_vector_elt t_a, 0
+///   a1: f32 = extract_vector_elt t_a, 1
+///   c0: f32 = extract_vector_elt t_c, 0
+///   c1: f32 = extract_vector_elt t_c, 1
+///   r0: f32 = fma a0, t1, c0
+///   r1: f32 = fma a1, t2, c1
+///   t4: v2f32 = BUILD_VECTOR r0, r1
+static SDValue PerformScalarizeV2F32Op(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v2f32)
+    return SDValue();
+
+  // Only scalarize when at least one operand is a BUILD_VECTOR whose elements
+  // are guaranteed to reside in different register pairs.
+  if (none_of(N->ops(), isNonCoalescableBuildVector))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  EVT EltVT = VT.getVectorElementType();
+  unsigned Opc = N->getOpcode();
+
+  // For each operand, get the scalar element at the given index: if the operand
+  // is a BUILD_VECTOR, grab the element directly; otherwise, emit an
+  // EXTRACT_VECTOR_ELT.
+  auto GetElement = [&](SDValue Op, unsigned Index) -> SDValue {
+    if (Op.getOpcode() == ISD::BUILD_VECTOR)
+      return Op.getOperand(Index);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
+                       DAG.getVectorIdxConstant(Index, DL));
+  };
+
+  // Build scalar operand lists for element 0 and element 1.
+  SmallVector<SDValue, 3> Ops0, Ops1;
+  for (const SDValue &Op : N->ops()) {
+    Ops0.push_back(GetElement(Op, 0));
+    Ops1.push_back(GetElement(Op, 1));
+  }
+
+  SDValue Res0 = DAG.getNode(Opc, DL, EltVT, Ops0, N->getFlags());
+  SDValue Res1 = DAG.getNode(Opc, DL, EltVT, Ops1, N->getFlags());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Res0, Res1);
+}
+
 /// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
 ///
 static SDValue PerformFADDCombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 CodeGenOptLevel OptLevel) {
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  CodeGenOptLevel OptLevel) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  if (SDValue Result = PerformScalarizeV2F32Op(N, DCI))
+    return Result;
+
   EVT VT = N0.getValueType();
   if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
     return SDValue();
@@ -6993,6 +7109,10 @@
     return PerformEXTRACTCombine(N, DCI);
   case ISD::FADD:
     return PerformFADDCombine(N, DCI, OptLevel);
+  case ISD::FMA:
+  case ISD::FMUL:
+  case ISD::FSUB:
+    return PerformScalarizeV2F32Op(N, DCI);
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
   case ISD::FMAXIMUM:
diff --git a/llvm/test/CodeGen/NVPTX/scalarize-non-coalescable-v2f32.ll b/llvm/test/CodeGen/NVPTX/scalarize-non-coalescable-v2f32.ll
new file mode 100644
index 0000000..f953b86
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/scalarize-non-coalescable-v2f32.ll
@@ -0,0 +1,356 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define <2 x float> @fma_non_coalescable(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_non_coalescable(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<11>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_non_coalescable_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [fma_non_coalescable_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [fma_non_coalescable_param_2];
+; CHECK-NEXT:    fma.rn.f32 %r9, %r6, %r3, %r8;
+; CHECK-NEXT:    fma.rn.f32 %r10, %r5, %r1, %r7;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 0
+  %e2  = extractelement <4 x float> %ld, i32 2
+  %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+  %bv  = insertelement <2 x float> %bv0,  float %e2, i32 1
+  %mul = fmul <2 x float> %a, %bv
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}
+
+define <2 x float> @fma_constant_and_dynamic_index(i32 %idx, ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_constant_and_dynamic_index(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot1;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [fma_constant_and_dynamic_index_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    ld.param.b64 %rd6, [fma_constant_and_dynamic_index_param_1];
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd7, %rd8}, [%rd6];
+; CHECK-NEXT:    mov.b64 {%r1, _}, %rd7;
+; CHECK-NEXT:    ld.param.v2.b32 {%r2, %r3}, [fma_constant_and_dynamic_index_param_2];
+; CHECK-NEXT:    st.b64 [%SP+8], %rd8;
+; CHECK-NEXT:    st.b64 [%SP], %rd7;
+; CHECK-NEXT:    ld.b32 %r4, [%rd5];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [fma_constant_and_dynamic_index_param_3];
+; CHECK-NEXT:    fma.rn.f32 %r7, %r2, %r1, %r5;
+; CHECK-NEXT:    fma.rn.f32 %r8, %r3, %r4, %r6;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r7, %r8};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 0
+  %e1  = extractelement <4 x float> %ld, i32 %idx
+  %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+  %bv  = insertelement <2 x float> %bv0,  float %e1, i32 1
+  %mul = fmul <2 x float> %a, %bv
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}
+
+define <2 x float> @fma_shufflevector_non_coalescable(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_shufflevector_non_coalescable(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<11>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_shufflevector_non_coalescable_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [fma_shufflevector_non_coalescable_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [fma_shufflevector_non_coalescable_param_2];
+; CHECK-NEXT:    fma.rn.f32 %r9, %r6, %r3, %r8;
+; CHECK-NEXT:    fma.rn.f32 %r10, %r5, %r1, %r7;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %bv  = shufflevector <4 x float> %ld, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %mul = fmul <2 x float> %a, %bv
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}
+
+define <2 x float> @fma_mixed_extract_and_scalar(ptr addrspace(3) %p, float %s, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_mixed_extract_and_scalar(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<12>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_mixed_extract_and_scalar_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b32 %r5, [fma_mixed_extract_and_scalar_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r6, %r7}, [fma_mixed_extract_and_scalar_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r8, %r9}, [fma_mixed_extract_and_scalar_param_3];
+; CHECK-NEXT:    fma.rn.f32 %r10, %r7, %r5, %r9;
+; CHECK-NEXT:    fma.rn.f32 %r11, %r6, %r3, %r8;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r11, %r10};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e2  = extractelement <4 x float> %ld, i32 2
+  %bv0 = insertelement <2 x float> poison, float %e2, i32 0
+  %bv  = insertelement <2 x float> %bv0,  float %s,  i32 1
+  %mul = fmul <2 x float> %a, %bv
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}
+
+define <2 x float> @fadd_non_coalescable(ptr addrspace(3) %p, <2 x float> %a) {
+; CHECK-LABEL: fadd_non_coalescable(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fadd_non_coalescable_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [fadd_non_coalescable_param_1];
+; CHECK-NEXT:    add.f32 %r7, %r6, %r3;
+; CHECK-NEXT:    add.f32 %r8, %r5, %r1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 0
+  %e2  = extractelement <4 x float> %ld, i32 2
+  %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+  %bv  = insertelement <2 x float> %bv0,  float %e2, i32 1
+  %add = fadd <2 x float> %a, %bv
+  ret <2 x float> %add
+}
+
+define <2 x float> @fmul_non_coalescable(ptr addrspace(3) %p, <2 x float> %a) {
+; CHECK-LABEL: fmul_non_coalescable(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fmul_non_coalescable_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [fmul_non_coalescable_param_1];
+; CHECK-NEXT:    mul.f32 %r7, %r6, %r3;
+; CHECK-NEXT:    mul.f32 %r8, %r5, %r1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 0
+  %e2  = extractelement <4 x float> %ld, i32 2
+  %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+  %bv  = insertelement <2 x float> %bv0,  float %e2, i32 1
+  %mul = fmul <2 x float> %a, %bv
+  ret <2 x float> %mul
+}
+
+define <2 x float> @fsub_non_coalescable(ptr addrspace(3) %p, <2 x float> %a) {
+; CHECK-LABEL: fsub_non_coalescable(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fsub_non_coalescable_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [fsub_non_coalescable_param_1];
+; CHECK-NEXT:    sub.f32 %r7, %r6, %r3;
+; CHECK-NEXT:    sub.f32 %r8, %r5, %r1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 0
+  %e2  = extractelement <4 x float> %ld, i32 2
+  %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+  %bv  = insertelement <2 x float> %bv0,  float %e2, i32 1
+  %sub = fsub <2 x float> %a, %bv
+  ret <2 x float> %sub
+}
+
+; The rest of these tests should remain vectorized
+define <4 x float> @fma_adjacent_elements(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_adjacent_elements(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_adjacent_elements_param_0];
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd4, [fma_adjacent_elements_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd5, [fma_adjacent_elements_param_2];
+; CHECK-NEXT:    fma.rn.f32x2 %rd6, %rd4, %rd2, %rd5;
+; CHECK-NEXT:    fma.rn.f32x2 %rd7, %rd4, %rd3, %rd5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd6, %rd7};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 0
+  %e1  = extractelement <4 x float> %ld, i32 1
+  %e2  = extractelement <4 x float> %ld, i32 2
+  %e3  = extractelement <4 x float> %ld, i32 3
+  %lo0 = insertelement <2 x float> poison, float %e0, i32 0
+  %lo  = insertelement <2 x float> %lo0,  float %e1, i32 1
+  %hi0 = insertelement <2 x float> poison, float %e2, i32 0
+  %hi  = insertelement <2 x float> %hi0,  float %e3, i32 1
+  %mul_lo = fmul <2 x float> %a, %lo
+  %res_lo = fadd <2 x float> %mul_lo, %c
+  %mul_hi = fmul <2 x float> %a, %hi
+  %res_hi = fadd <2 x float> %mul_hi, %c
+  %out = shufflevector <2 x float> %res_lo, <2 x float> %res_hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %out
+}
+
+define <4 x float> @fma_adjacent_swapped_elements(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_adjacent_swapped_elements(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_adjacent_swapped_elements_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [fma_adjacent_swapped_elements_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [fma_adjacent_swapped_elements_param_2];
+; CHECK-NEXT:    mov.b64 %rd4, {%r2, %r1};
+; CHECK-NEXT:    mov.b64 %rd5, {%r4, %r3};
+; CHECK-NEXT:    fma.rn.f32x2 %rd6, %rd2, %rd4, %rd3;
+; CHECK-NEXT:    fma.rn.f32x2 %rd7, %rd2, %rd5, %rd3;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd6, %rd7};
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 0
+  %e1  = extractelement <4 x float> %ld, i32 1
+  %e2  = extractelement <4 x float> %ld, i32 2
+  %e3  = extractelement <4 x float> %ld, i32 3
+  %lo0 = insertelement <2 x float> poison, float %e1, i32 0
+  %lo  = insertelement <2 x float> %lo0,  float %e0, i32 1
+  %hi0 = insertelement <2 x float> poison, float %e3, i32 0
+  %hi  = insertelement <2 x float> %hi0,  float %e2, i32 1
+  %mul_lo = fmul <2 x float> %a, %lo
+  %res_lo = fadd <2 x float> %mul_lo, %c
+  %mul_hi = fmul <2 x float> %a, %hi
+  %res_hi = fadd <2 x float> %mul_hi, %c
+  %out = shufflevector <2 x float> %res_lo, <2 x float> %res_hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %out
+}
+
+define <2 x float> @fma_both_dynamic_indices(i32 %i0, i32 %i1, ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_both_dynamic_indices(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot9[32];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<18>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot9;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [fma_both_dynamic_indices_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 16;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    ld.param.b32 %rd6, [fma_both_dynamic_indices_param_1];
+; CHECK-NEXT:    and.b64 %rd7, %rd6, 3;
+; CHECK-NEXT:    shl.b64 %rd8, %rd7, 2;
+; CHECK-NEXT:    add.u64 %rd9, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd10, %rd9, %rd8;
+; CHECK-NEXT:    ld.param.b64 %rd11, [fma_both_dynamic_indices_param_2];
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd12, %rd13}, [%rd11];
+; CHECK-NEXT:    st.b64 [%SP+24], %rd13;
+; CHECK-NEXT:    st.b64 [%SP+16], %rd12;
+; CHECK-NEXT:    ld.b32 %r1, [%rd5];
+; CHECK-NEXT:    st.b64 [%SP+8], %rd13;
+; CHECK-NEXT:    st.b64 [%SP], %rd12;
+; CHECK-NEXT:    ld.b32 %r2, [%rd10];
+; CHECK-NEXT:    ld.param.b64 %rd14, [fma_both_dynamic_indices_param_3];
+; CHECK-NEXT:    ld.param.b64 %rd15, [fma_both_dynamic_indices_param_4];
+; CHECK-NEXT:    mov.b64 %rd16, {%r1, %r2};
+; CHECK-NEXT:    fma.rn.f32x2 %rd17, %rd14, %rd16, %rd15;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd17;
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %e0  = extractelement <4 x float> %ld, i32 %i0
+  %e1  = extractelement <4 x float> %ld, i32 %i1
+  %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+  %bv  = insertelement <2 x float> %bv0,  float %e1, i32 1
+  %mul = fmul <2 x float> %a, %bv
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}
+
+define <2 x float> @fma_shufflevector_adjacent_elements(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_shufflevector_adjacent_elements(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_shufflevector_adjacent_elements_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [fma_shufflevector_adjacent_elements_param_1];
+; CHECK-NEXT:    ld.shared.b64 %rd3, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd4, [fma_shufflevector_adjacent_elements_param_2];
+; CHECK-NEXT:    fma.rn.f32x2 %rd5, %rd2, %rd3, %rd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT:    ret;
+  %ld  = load <4 x float>, ptr addrspace(3) %p, align 16
+  %bv  = shufflevector <4 x float> %ld, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+  %mul = fmul <2 x float> %a, %bv
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}
+
+define <2 x float> @fma_naturally_paired(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: fma_naturally_paired(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_naturally_paired_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [fma_naturally_paired_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [fma_naturally_paired_param_2];
+; CHECK-NEXT:    fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT:    ret;
+  %mul = fmul <2 x float> %a, %b
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}
+
+define <2 x float> @fma_coalescable_scalars(float %x, float %y, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: fma_coalescable_scalars(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fma_coalescable_scalars_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fma_coalescable_scalars_param_1];
+; CHECK-NEXT:    mov.b64 %rd1, {%r1, %r2};
+; CHECK-NEXT:    ld.param.b64 %rd2, [fma_coalescable_scalars_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd3, [fma_coalescable_scalars_param_3];
+; CHECK-NEXT:    fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT:    ret;
+  %v0  = insertelement <2 x float> poison, float %x, i32 0
+  %v   = insertelement <2 x float> %v0,  float %y, i32 1
+  %mul = fmul <2 x float> %v, %b
+  %res = fadd <2 x float> %mul, %c
+  ret <2 x float> %res
+}