[NVPTX] Scalarize v2f32 instructions if input operand guarantees need for register coalescing (#180113)
The support for packed f32 instructions added in #126337 revealed
performance regressions on certain kernels. In one case, the cause is
loading a v4f32 from shared memory and then accessing its elements as
{r0, r2} and {r1, r3} out of the full load {r0, r1, r2, r3}.
This access pattern guarantees that the registers require a coalescing
copy, which increases register pressure and degrades performance.
The fix is to detect when a v2f32 operand provably comes from
non-contiguous vector extracts and, if so, scalarize the operation so
the coalescing copy is no longer needed (see the sketch below).
I've found that ptxas can see through the extra unpacks/repacks of
contiguous registers this causes in MIR. However, in the full test case,
packing the final scalar results back into a vector does generate
additional cost, especially since the only users immediately unpack
them. An additional MIR pass could catch that case.
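A minimal IR sketch of the triggering pattern, reduced from the new test
below (the function name is illustrative):

  define <2 x float> @repack_example(ptr addrspace(3) %p, <2 x float> %a) {
    ; lanes 0 and 2 of the v4f32 load can never share a register pair
    %ld = load <4 x float>, ptr addrspace(3) %p, align 16
    %e0 = extractelement <4 x float> %ld, i32 0
    %e2 = extractelement <4 x float> %ld, i32 2
    %v0 = insertelement <2 x float> poison, float %e0, i32 0
    %v = insertelement <2 x float> %v0, float %e2, i32 1
    ; with this combine the multiply is emitted as two scalar mul.f32
    ; instead of one mul.f32x2 fed by a register-pair repack
    %mul = fmul <2 x float> %a, %v
    ret <2 x float> %mul
  }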
Assisted-by: Cursor / claude-4.6-opus-high
---------
Co-authored-by: Princeton Ferro <princetonferro@gmail.com>
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8f1b705..f5554be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -891,6 +891,10 @@
ISD::SIGN_EXTEND,
ISD::INTRINSIC_WO_CHAIN});
+ // If the vector operands require register coalescing, scalarize instead
+ if (STI.hasF32x2Instructions())
+ setTargetDAGCombine({ISD::FMA, ISD::FMUL, ISD::FSUB});
+
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
if (STI.allowFP16Math() || STI.hasBF16Math())
@@ -6135,14 +6139,126 @@
return PerformADDCombineWithOperands(N, N1, N0, DCI);
}
+/// Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent
+/// register pairs (non-coalescable).
+static bool isNonCoalescableBuildVector(const SDValue &BV) {
+ if (BV.getOpcode() != ISD::BUILD_VECTOR || BV.getValueType() != MVT::v2f32)
+ return false;
+
+ SDValue Elt0 = BV.getOperand(0);
+ SDValue Elt1 = BV.getOperand(1);
+
+ bool IsExt0 = Elt0.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+ bool IsExt1 = Elt1.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+
+ // If neither element is an EXTRACT_VECTOR_ELT they are free-standing
+ // scalars and the register allocator can still place them side-by-side.
+ if (!IsExt0 && !IsExt1)
+ return false;
+
+ // If exactly one element is an EXTRACT_VECTOR_ELT, the other is a scalar
+ // that cannot generally occupy the adjacent register slot.
+ if (IsExt0 != IsExt1)
+ return true;
+
+ // At this point both sources are extracting from vectors. If they are from
+ // different vectors, then the BUILD_VECTOR is non-coalescable.
+ SDValue Src0 = Elt0.getOperand(0);
+ SDValue Src1 = Elt1.getOperand(0);
+ if (Src0 != Src1)
+ return true;
+
+ auto *Idx0 = dyn_cast<ConstantSDNode>(Elt0.getOperand(1));
+ auto *Idx1 = dyn_cast<ConstantSDNode>(Elt1.getOperand(1));
+ // If both indices are dynamic they will be lowered to
+ // loads and the vector will be spilled to local memory. The register
+ // allocator can easily place the results in adjacent registers.
+ if (!Idx0 && !Idx1)
+ return false;
+
+ // If one index is dynamic and the other is constant, the constant-indexed
+ // element occupies a register that must be paired with the result of the
+ // dynamic load, requiring an extra copy. We consider this non-coalescable.
+ if ((Idx0 && !Idx1) || (!Idx0 && Idx1))
+ return true;
+
+ // Both are constant, adjacent pairs are coalescable
+ return std::abs(Idx0->getSExtValue() - Idx1->getSExtValue()) != 1;
+}
+
+/// Scalarize a v2f32 arithmetic node (FADD, FMUL, FSUB, FMA) when at least
+/// one operand is a BUILD_VECTOR that repacks values from non-adjacent register
+/// pairs. Without this combine the BUILD_VECTOR forces allocation of a
+/// temporary 64-bit register, increasing register pressure.
+///
+/// Example - before:
+/// t0: v2f32,v2f32,ch = LoadV2 ...
+/// t1: f32 = extract_vector_elt t0, 0
+/// t2: f32 = extract_vector_elt t0:1, 0
+/// t3: v2f32 = BUILD_VECTOR t1, t2 ;; non-coalescable repack
+/// t4: v2f32 = fma t_a, t3, t_c
+///
+/// After:
+/// t0: v2f32,v2f32,ch = LoadV2 ...
+/// t1: f32 = extract_vector_elt t0, 0
+/// t2: f32 = extract_vector_elt t0:1, 0
+/// a0: f32 = extract_vector_elt t_a, 0
+/// a1: f32 = extract_vector_elt t_a, 1
+/// c0: f32 = extract_vector_elt t_c, 0
+/// c1: f32 = extract_vector_elt t_c, 1
+/// r0: f32 = fma a0, t1, c0
+/// r1: f32 = fma a1, t2, c1
+/// t4: v2f32 = BUILD_VECTOR r0, r1
+static SDValue PerformScalarizeV2F32Op(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
+ return SDValue();
+
+ // Only scalarize when at least one operand is a BUILD_VECTOR whose elements
+ // are guaranteed to reside in different register pairs.
+ if (none_of(N->ops(), isNonCoalescableBuildVector))
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ EVT EltVT = VT.getVectorElementType();
+ unsigned Opc = N->getOpcode();
+
+ // For each operand, get the scalar element at the given index: if the operand
+ // is a BUILD_VECTOR, grab the element directly; otherwise, emit an
+ // EXTRACT_VECTOR_ELT.
+ auto GetElement = [&](SDValue Op, unsigned Index) -> SDValue {
+ if (Op.getOpcode() == ISD::BUILD_VECTOR)
+ return Op.getOperand(Index);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
+ DAG.getVectorIdxConstant(Index, DL));
+ };
+
+ // Build scalar operand lists for element 0 and element 1.
+ SmallVector<SDValue, 3> Ops0, Ops1;
+ for (const SDValue &Op : N->ops()) {
+ Ops0.push_back(GetElement(Op, 0));
+ Ops1.push_back(GetElement(Op, 1));
+ }
+
+ SDValue Res0 = DAG.getNode(Opc, DL, EltVT, Ops0, N->getFlags());
+ SDValue Res1 = DAG.getNode(Opc, DL, EltVT, Ops1, N->getFlags());
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Res0, Res1);
+}
+
/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
///
static SDValue PerformFADDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOptLevel OptLevel) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ if (SDValue Result = PerformScalarizeV2F32Op(N, DCI))
+ return Result;
+
EVT VT = N0.getValueType();
if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
return SDValue();
@@ -6993,6 +7109,10 @@
return PerformEXTRACTCombine(N, DCI);
case ISD::FADD:
return PerformFADDCombine(N, DCI, OptLevel);
+ case ISD::FMA:
+ case ISD::FMUL:
+ case ISD::FSUB:
+ return PerformScalarizeV2F32Op(N, DCI);
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXIMUM:
diff --git a/llvm/test/CodeGen/NVPTX/scalarize-non-coalescable-v2f32.ll b/llvm/test/CodeGen/NVPTX/scalarize-non-coalescable-v2f32.ll
new file mode 100644
index 0000000..f953b86
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/scalarize-non-coalescable-v2f32.ll
@@ -0,0 +1,356 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define <2 x float> @fma_non_coalescable(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_non_coalescable(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fma_non_coalescable_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [fma_non_coalescable_param_1];
+; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [fma_non_coalescable_param_2];
+; CHECK-NEXT: fma.rn.f32 %r9, %r6, %r3, %r8;
+; CHECK-NEXT: fma.rn.f32 %r10, %r5, %r1, %r7;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 0
+ %e2 = extractelement <4 x float> %ld, i32 2
+ %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+ %bv = insertelement <2 x float> %bv0, float %e2, i32 1
+ %mul = fmul <2 x float> %a, %bv
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}
+
+define <2 x float> @fma_constant_and_dynamic_index(i32 %idx, ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_constant_and_dynamic_index(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b32 %rd1, [fma_constant_and_dynamic_index_param_0];
+; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT: add.u64 %rd4, %SP, 0;
+; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT: ld.param.b64 %rd6, [fma_constant_and_dynamic_index_param_1];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd7, %rd8}, [%rd6];
+; CHECK-NEXT: mov.b64 {%r1, _}, %rd7;
+; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [fma_constant_and_dynamic_index_param_2];
+; CHECK-NEXT: st.b64 [%SP+8], %rd8;
+; CHECK-NEXT: st.b64 [%SP], %rd7;
+; CHECK-NEXT: ld.b32 %r4, [%rd5];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [fma_constant_and_dynamic_index_param_3];
+; CHECK-NEXT: fma.rn.f32 %r7, %r2, %r1, %r5;
+; CHECK-NEXT: fma.rn.f32 %r8, %r3, %r4, %r6;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r7, %r8};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 0
+ %e1 = extractelement <4 x float> %ld, i32 %idx
+ %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+ %bv = insertelement <2 x float> %bv0, float %e1, i32 1
+ %mul = fmul <2 x float> %a, %bv
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}
+
+define <2 x float> @fma_shufflevector_non_coalescable(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_shufflevector_non_coalescable(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fma_shufflevector_non_coalescable_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [fma_shufflevector_non_coalescable_param_1];
+; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [fma_shufflevector_non_coalescable_param_2];
+; CHECK-NEXT: fma.rn.f32 %r9, %r6, %r3, %r8;
+; CHECK-NEXT: fma.rn.f32 %r10, %r5, %r1, %r7;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %bv = shufflevector <4 x float> %ld, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %mul = fmul <2 x float> %a, %bv
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}
+
+define <2 x float> @fma_mixed_extract_and_scalar(ptr addrspace(3) %p, float %s, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_mixed_extract_and_scalar(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<12>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fma_mixed_extract_and_scalar_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.param.b32 %r5, [fma_mixed_extract_and_scalar_param_1];
+; CHECK-NEXT: ld.param.v2.b32 {%r6, %r7}, [fma_mixed_extract_and_scalar_param_2];
+; CHECK-NEXT: ld.param.v2.b32 {%r8, %r9}, [fma_mixed_extract_and_scalar_param_3];
+; CHECK-NEXT: fma.rn.f32 %r10, %r7, %r5, %r9;
+; CHECK-NEXT: fma.rn.f32 %r11, %r6, %r3, %r8;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r11, %r10};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e2 = extractelement <4 x float> %ld, i32 2
+ %bv0 = insertelement <2 x float> poison, float %e2, i32 0
+ %bv = insertelement <2 x float> %bv0, float %s, i32 1
+ %mul = fmul <2 x float> %a, %bv
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}
+
+define <2 x float> @fadd_non_coalescable(ptr addrspace(3) %p, <2 x float> %a) {
+; CHECK-LABEL: fadd_non_coalescable(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fadd_non_coalescable_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [fadd_non_coalescable_param_1];
+; CHECK-NEXT: add.f32 %r7, %r6, %r3;
+; CHECK-NEXT: add.f32 %r8, %r5, %r1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 0
+ %e2 = extractelement <4 x float> %ld, i32 2
+ %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+ %bv = insertelement <2 x float> %bv0, float %e2, i32 1
+ %add = fadd <2 x float> %a, %bv
+ ret <2 x float> %add
+}
+
+define <2 x float> @fmul_non_coalescable(ptr addrspace(3) %p, <2 x float> %a) {
+; CHECK-LABEL: fmul_non_coalescable(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fmul_non_coalescable_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [fmul_non_coalescable_param_1];
+; CHECK-NEXT: mul.f32 %r7, %r6, %r3;
+; CHECK-NEXT: mul.f32 %r8, %r5, %r1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 0
+ %e2 = extractelement <4 x float> %ld, i32 2
+ %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+ %bv = insertelement <2 x float> %bv0, float %e2, i32 1
+ %mul = fmul <2 x float> %a, %bv
+ ret <2 x float> %mul
+}
+
+define <2 x float> @fsub_non_coalescable(ptr addrspace(3) %p, <2 x float> %a) {
+; CHECK-LABEL: fsub_non_coalescable(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fsub_non_coalescable_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [fsub_non_coalescable_param_1];
+; CHECK-NEXT: sub.f32 %r7, %r6, %r3;
+; CHECK-NEXT: sub.f32 %r8, %r5, %r1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 0
+ %e2 = extractelement <4 x float> %ld, i32 2
+ %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+ %bv = insertelement <2 x float> %bv0, float %e2, i32 1
+ %sub = fsub <2 x float> %a, %bv
+ ret <2 x float> %sub
+}
+
+; The rest of these tests should remain vectorized
+define <4 x float> @fma_adjacent_elements(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_adjacent_elements(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fma_adjacent_elements_param_0];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd4, [fma_adjacent_elements_param_1];
+; CHECK-NEXT: ld.param.b64 %rd5, [fma_adjacent_elements_param_2];
+; CHECK-NEXT: fma.rn.f32x2 %rd6, %rd4, %rd2, %rd5;
+; CHECK-NEXT: fma.rn.f32x2 %rd7, %rd4, %rd3, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd7};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 0
+ %e1 = extractelement <4 x float> %ld, i32 1
+ %e2 = extractelement <4 x float> %ld, i32 2
+ %e3 = extractelement <4 x float> %ld, i32 3
+ %lo0 = insertelement <2 x float> poison, float %e0, i32 0
+ %lo = insertelement <2 x float> %lo0, float %e1, i32 1
+ %hi0 = insertelement <2 x float> poison, float %e2, i32 0
+ %hi = insertelement <2 x float> %hi0, float %e3, i32 1
+ %mul_lo = fmul <2 x float> %a, %lo
+ %res_lo = fadd <2 x float> %mul_lo, %c
+ %mul_hi = fmul <2 x float> %a, %hi
+ %res_hi = fadd <2 x float> %mul_hi, %c
+ %out = shufflevector <2 x float> %res_lo, <2 x float> %res_hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %out
+}
+
+define <4 x float> @fma_adjacent_swapped_elements(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_adjacent_swapped_elements(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fma_adjacent_swapped_elements_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [fma_adjacent_swapped_elements_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [fma_adjacent_swapped_elements_param_2];
+; CHECK-NEXT: mov.b64 %rd4, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3};
+; CHECK-NEXT: fma.rn.f32x2 %rd6, %rd2, %rd4, %rd3;
+; CHECK-NEXT: fma.rn.f32x2 %rd7, %rd2, %rd5, %rd3;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd7};
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 0
+ %e1 = extractelement <4 x float> %ld, i32 1
+ %e2 = extractelement <4 x float> %ld, i32 2
+ %e3 = extractelement <4 x float> %ld, i32 3
+ %lo0 = insertelement <2 x float> poison, float %e1, i32 0
+ %lo = insertelement <2 x float> %lo0, float %e0, i32 1
+ %hi0 = insertelement <2 x float> poison, float %e3, i32 0
+ %hi = insertelement <2 x float> %hi0, float %e2, i32 1
+ %mul_lo = fmul <2 x float> %a, %lo
+ %res_lo = fadd <2 x float> %mul_lo, %c
+ %mul_hi = fmul <2 x float> %a, %hi
+ %res_hi = fadd <2 x float> %mul_hi, %c
+ %out = shufflevector <2 x float> %res_lo, <2 x float> %res_hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %out
+}
+
+define <2 x float> @fma_both_dynamic_indices(i32 %i0, i32 %i1, ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_both_dynamic_indices(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot9[32];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<18>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.b64 %SPL, __local_depot9;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b32 %rd1, [fma_both_dynamic_indices_param_0];
+; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT: add.u64 %rd4, %SP, 16;
+; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT: ld.param.b32 %rd6, [fma_both_dynamic_indices_param_1];
+; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
+; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
+; CHECK-NEXT: add.u64 %rd9, %SP, 0;
+; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: ld.param.b64 %rd11, [fma_both_dynamic_indices_param_2];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd12, %rd13}, [%rd11];
+; CHECK-NEXT: st.b64 [%SP+24], %rd13;
+; CHECK-NEXT: st.b64 [%SP+16], %rd12;
+; CHECK-NEXT: ld.b32 %r1, [%rd5];
+; CHECK-NEXT: st.b64 [%SP+8], %rd13;
+; CHECK-NEXT: st.b64 [%SP], %rd12;
+; CHECK-NEXT: ld.b32 %r2, [%rd10];
+; CHECK-NEXT: ld.param.b64 %rd14, [fma_both_dynamic_indices_param_3];
+; CHECK-NEXT: ld.param.b64 %rd15, [fma_both_dynamic_indices_param_4];
+; CHECK-NEXT: mov.b64 %rd16, {%r1, %r2};
+; CHECK-NEXT: fma.rn.f32x2 %rd17, %rd14, %rd16, %rd15;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd17;
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %e0 = extractelement <4 x float> %ld, i32 %i0
+ %e1 = extractelement <4 x float> %ld, i32 %i1
+ %bv0 = insertelement <2 x float> poison, float %e0, i32 0
+ %bv = insertelement <2 x float> %bv0, float %e1, i32 1
+ %mul = fmul <2 x float> %a, %bv
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}
+
+define <2 x float> @fma_shufflevector_adjacent_elements(ptr addrspace(3) %p, <2 x float> %a, <2 x float> %c) {
+; CHECK-LABEL: fma_shufflevector_adjacent_elements(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fma_shufflevector_adjacent_elements_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [fma_shufflevector_adjacent_elements_param_1];
+; CHECK-NEXT: ld.shared.b64 %rd3, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd4, [fma_shufflevector_adjacent_elements_param_2];
+; CHECK-NEXT: fma.rn.f32x2 %rd5, %rd2, %rd3, %rd4;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT: ret;
+ %ld = load <4 x float>, ptr addrspace(3) %p, align 16
+ %bv = shufflevector <4 x float> %ld, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+ %mul = fmul <2 x float> %a, %bv
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}
+
+define <2 x float> @fma_naturally_paired(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: fma_naturally_paired(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [fma_naturally_paired_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [fma_naturally_paired_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [fma_naturally_paired_param_2];
+; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT: ret;
+ %mul = fmul <2 x float> %a, %b
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}
+
+define <2 x float> @fma_coalescable_scalars(float %x, float %y, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: fma_coalescable_scalars(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [fma_coalescable_scalars_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [fma_coalescable_scalars_param_1];
+; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2};
+; CHECK-NEXT: ld.param.b64 %rd2, [fma_coalescable_scalars_param_2];
+; CHECK-NEXT: ld.param.b64 %rd3, [fma_coalescable_scalars_param_3];
+; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT: ret;
+ %v0 = insertelement <2 x float> poison, float %x, i32 0
+ %v = insertelement <2 x float> %v0, float %y, i32 1
+ %mul = fmul <2 x float> %v, %b
+ %res = fadd <2 x float> %mul, %c
+ ret <2 x float> %res
+}