clang/test/CodeGenCUDA/fp-contract.cu - llvm-project - Git at Google

 // REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target

 // By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=fast-honor-pragmas.
 // we should fuse multiply/add into fma instruction.
 // In IR, fmul/fadd instructions with contract flag are emitted.
 // In backend
 //    nvptx -  assumes fast fp fuse option, which fuses
 //             mult/add insts disregarding contract flag and
 //             llvm.fmuladd intrinsics.
 //    amdgcn - assumes standard fp fuse option, which only
 //             fuses mult/add insts with contract flag and
 //             llvm.fmuladd intrinsics.

 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -disable-llvm-passes -o - %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -O3 -o - %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

 // Check separate compile/backend steps corresponding to -save-temps.

 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

 // Explicit -ffp-contract=fast
 // In IR, fmul/fadd instructions with contract flag are emitted.
 // In backend
 //    nvptx/amdgcn - assumes fast fp fuse option, which fuses
 //                   mult/add insts disregarding contract flag and
 //                   llvm.fmuladd intrinsics.

 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
 // RUN:   -ffp-contract=fast \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -O3 -o - %s \
 // RUN:   -ffp-contract=fast \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
 // RUN:   -ffp-contract=fast \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s

 // Check separate compile/backend steps corresponding to -save-temps.
 // When input is IR, -ffp-contract has no effect. Backend uses default
 // default FP fuse option.

 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -ffp-contract=fast \
 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

 // Explicit -ffp-contract=fast-honor-pragmas
 // In IR, fmul/fadd instructions with contract flag are emitted.
 // In backend
 //    nvptx/amdgcn - assumes standard fp fuse option, which only
 //                   fuses mult/add insts with contract flag or
 //                   llvm.fmuladd intrinsics.

 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=fast-honor-pragmas -disable-llvm-passes -o - %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
 // RUN:   -ffp-contract=fast-honor-pragmas \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -O3 -o - %s \
 // RUN:   -ffp-contract=fast-honor-pragmas \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
 // RUN:   -ffp-contract=fast-honor-pragmas \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

 // Check separate compile/backend steps corresponding to -save-temps.
 // When input is IR, -ffp-contract has no effect. Backend uses default
 // default FP fuse option.

 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -ffp-contract=fast-honor-pragmas \
 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

 // Explicit -ffp-contract=on -- fusing by front-end.
 // In IR,
 //    mult/add in the same statement - llvm.fmuladd instrinsic emitted
 //    mult/add in different statement -  fmul/fadd instructions without
 //                                       contract flag are emitted.
 // In backend
 //    nvptx/amdgcn - assumes standard fp fuse option, which only
 //                   fuses mult/add insts with contract flag or
 //                   llvm.fmuladd intrinsics.

 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
 // RUN:   -ffp-contract=on \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -O3 -o - %s \
 // RUN:   -ffp-contract=on \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
 // RUN:   -ffp-contract=on \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s

 // Check separate compile/backend steps corresponding to -save-temps.

 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -ffp-contract=on \
 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s

 // Explicit -ffp-contract=off should disable instruction fusing.
 // In IR, fmul/fadd instructions without contract flag are emitted.
 // In backend
 //    nvptx/amdgcn - assumes standard fp fuse option, which only
 //                   fuses mult/add insts with contract flag or
 //                   llvm.fmuladd intrinsics.

 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OFF %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
 // RUN:   -ffp-contract=off \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OFF %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -O3 -o - %s \
 // RUN:   -ffp-contract=off \
 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
 // RUN:   -ffp-contract=off \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s

 // Check separate compile/backend steps corresponding to -save-temps.

 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -ffp-contract=off \
 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s

 #include "Inputs/cuda.h"

 // Test multiply/add in the same statement, which can be emitted as FMA when
 // fp-contract is on or fast.
 __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
 // COMMON-LABEL: _Z4funcfff
 // NV-ON:       fma.rn.f32
 // NV-ON-NEXT:  st.param.f32
 // AMD-ON:       v_fmac_f32_e64
 // AMD-ON-NEXT:  s_setpc_b64

 // NV-OFF:      mul.rn.f32
 // NV-OFF-NEXT: add.rn.f32
 // NV-OFF-NEXT: st.param.f32
 // AMD-OFF:      v_mul_f32_e64
 // AMD-OFF-NEXT: v_add_f32_e64
 // AMD-OFF-NEXT: s_setpc_b64

 // NV-OPT-FAST: fma.rn.f32
 // NV-OPT-FAST-NEXT: st.param.f32
 // NV-OPT-FASTSTD: fma.rn.f32
 // NV-OPT-FASTSTD-NEXT: st.param.f32
 // NV-OPT-ON: fma.rn.f32
 // NV-OPT-ON-NEXT: st.param.f32
 // NV-OPT-OFF: mul.rn.f32
 // NV-OPT-OFF-NEXT: add.rn.f32
 // NV-OPT-OFF-NEXT: st.param.f32

 // AMD-OPT-FAST-IR: fmul contract float
 // AMD-OPT-FAST-IR: fadd contract float
 // AMD-OPT-ON-IR: @llvm.fmuladd.f32
 // AMD-OPT-OFF-IR: fmul float
 // AMD-OPT-OFF-IR: fadd float

 // AMD-OPT-FAST: v_fmac_f32_e32
 // AMD-OPT-FAST-NEXT: s_setpc_b64
 // AMD-OPT-FASTSTD: v_fmac_f32_e32
 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
 // AMD-OPT-ON: v_fmac_f32_e32
 // AMD-OPT-ON-NEXT: s_setpc_b64
 // AMD-OPT-OFF: v_mul_f32_e32
 // AMD-OPT-OFF-NEXT: v_add_f32_e32
 // AMD-OPT-OFF-NEXT: s_setpc_b64

 // Test multiply/add in the different statements, which can be emitted as
 // FMA when fp-contract is fast but not on.
 __host__ __device__ float func2(float a, float b, float c) {
   float t = b * c;
   return t + a;
 }
 // COMMON-LABEL: _Z5func2fff
 // NV-OPT-FAST: fma.rn.f32
 // NV-OPT-FAST-NEXT: st.param.f32
 // NV-OPT-FASTSTD: fma.rn.f32
 // NV-OPT-FASTSTD-NEXT: st.param.f32
 // NV-OPT-ON: mul.rn.f32
 // NV-OPT-ON: add.rn.f32
 // NV-OPT-ON-NEXT: st.param.f32
 // NV-OPT-OFF: mul.rn.f32
 // NV-OPT-OFF: add.rn.f32
 // NV-OPT-OFF-NEXT: st.param.f32

 // AMD-OPT-FAST-IR: fmul contract float
 // AMD-OPT-FAST-IR: fadd contract float
 // AMD-OPT-ON-IR: fmul float
 // AMD-OPT-ON-IR: fadd float
 // AMD-OPT-OFF-IR: fmul float
 // AMD-OPT-OFF-IR: fadd float

 // AMD-OPT-FAST: v_fmac_f32_e32
 // AMD-OPT-FAST-NEXT: s_setpc_b64
 // AMD-OPT-FASTSTD: v_fmac_f32_e32
 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
 // AMD-OPT-ON: v_mul_f32_e32
 // AMD-OPT-ON-NEXT: v_add_f32_e32
 // AMD-OPT-ON-NEXT: s_setpc_b64
 // AMD-OPT-OFF: v_mul_f32_e32
 // AMD-OPT-OFF-NEXT: v_add_f32_e32
 // AMD-OPT-OFF-NEXT: s_setpc_b64

 // Test multiply/add in the different statements, which is forced
 // to be compiled with fp contract on. fmul/fadd without contract
 // flags are emitted in IR. In nvptx, they are emitted as FMA in
 // fp-contract is fast but not on, as nvptx backend uses the same
 // fp fuse option as front end, whereas fast fp fuse option in
 // backend fuses fadd/fmul disregarding contract flag. In amdgcn
 // they are not fused as amdgcn always use standard fp fusion
 // option which respects contract flag.
   __host__ __device__ float func3(float a, float b, float c) {
 #pragma clang fp contract(on)
   float t = b * c;
   return t + a;
 }
 // COMMON-LABEL: _Z5func3fff
 // NV-OPT-FAST: fma.rn.f32
 // NV-OPT-FAST-NEXT: st.param.f32
 // NV-OPT-FASTSTD: mul.rn.f32
 // NV-OPT-FASTSTD: add.rn.f32
 // NV-OPT-FASTSTD-NEXT: st.param.f32
 // NV-OPT-ON: mul.rn.f32
 // NV-OPT-ON: add.rn.f32
 // NV-OPT-ON-NEXT: st.param.f32
 // NV-OPT-OFF: mul.rn.f32
 // NV-OPT-OFF: add.rn.f32
 // NV-OPT-OFF-NEXT: st.param.f32

 // AMD-OPT-FAST-IR: fmul float
 // AMD-OPT-FAST-IR: fadd float
 // AMD-OPT-ON-IR: fmul float
 // AMD-OPT-ON-IR: fadd float
 // AMD-OPT-OFF-IR: fmul float
 // AMD-OPT-OFF-IR: fadd float

 // AMD-OPT-FAST: v_fmac_f32_e32
 // AMD-OPT-FAST-NEXT: s_setpc_b64
 // AMD-OPT-FASTSTD: v_mul_f32_e32
 // AMD-OPT-FASTSTD-NEXT: v_add_f32_e32
 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
 // AMD-OPT-ON: v_mul_f32_e32
 // AMD-OPT-ON-NEXT: v_add_f32_e32
 // AMD-OPT-ON-NEXT: s_setpc_b64
 // AMD-OPT-OFF: v_mul_f32_e32
 // AMD-OPT-OFF-NEXT: v_add_f32_e32
 // AMD-OPT-OFF-NEXT: s_setpc_b64
	// REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target

	// By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=fast-honor-pragmas.
	// we should fuse multiply/add into fma instruction.
	// In IR, fmul/fadd instructions with contract flag are emitted.
	// In backend
	// nvptx - assumes fast fp fuse option, which fuses
	// mult/add insts disregarding contract flag and
	// llvm.fmuladd intrinsics.
	// amdgcn - assumes standard fp fuse option, which only
	// fuses mult/add insts with contract flag and
	// llvm.fmuladd intrinsics.

	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -disable-llvm-passes -o - %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -O3 -o - %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

	// Check separate compile/backend steps corresponding to -save-temps.

	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
	// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
	// RUN: cat %t.ll \| FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
	// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

	// Explicit -ffp-contract=fast
	// In IR, fmul/fadd instructions with contract flag are emitted.
	// In backend
	// nvptx/amdgcn - assumes fast fp fuse option, which fuses
	// mult/add insts disregarding contract flag and
	// llvm.fmuladd intrinsics.

	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -ffp-contract=fast -disable-llvm-passes -o - %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
	// RUN: -ffp-contract=fast \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -O3 -o - %s \
	// RUN: -ffp-contract=fast \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \
	// RUN: -ffp-contract=fast \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s

	// Check separate compile/backend steps corresponding to -save-temps.
	// When input is IR, -ffp-contract has no effect. Backend uses default
	// default FP fuse option.

	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
	// RUN: -ffp-contract=fast \
	// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
	// RUN: cat %t.ll \| FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
	// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

	// Explicit -ffp-contract=fast-honor-pragmas
	// In IR, fmul/fadd instructions with contract flag are emitted.
	// In backend
	// nvptx/amdgcn - assumes standard fp fuse option, which only
	// fuses mult/add insts with contract flag or
	// llvm.fmuladd intrinsics.

	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -ffp-contract=fast-honor-pragmas -disable-llvm-passes -o - %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
	// RUN: -ffp-contract=fast-honor-pragmas \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -O3 -o - %s \
	// RUN: -ffp-contract=fast-honor-pragmas \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \
	// RUN: -ffp-contract=fast-honor-pragmas \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

	// Check separate compile/backend steps corresponding to -save-temps.
	// When input is IR, -ffp-contract has no effect. Backend uses default
	// default FP fuse option.

	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
	// RUN: -ffp-contract=fast-honor-pragmas \
	// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
	// RUN: cat %t.ll \| FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
	// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s

	// Explicit -ffp-contract=on -- fusing by front-end.
	// In IR,
	// mult/add in the same statement - llvm.fmuladd instrinsic emitted
	// mult/add in different statement - fmul/fadd instructions without
	// contract flag are emitted.
	// In backend
	// nvptx/amdgcn - assumes standard fp fuse option, which only
	// fuses mult/add insts with contract flag or
	// llvm.fmuladd intrinsics.

	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -ffp-contract=on -disable-llvm-passes -o - %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
	// RUN: -ffp-contract=on \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -O3 -o - %s \
	// RUN: -ffp-contract=on \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \
	// RUN: -ffp-contract=on \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s

	// Check separate compile/backend steps corresponding to -save-temps.

	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
	// RUN: -ffp-contract=on \
	// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
	// RUN: cat %t.ll \| FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s
	// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s

	// Explicit -ffp-contract=off should disable instruction fusing.
	// In IR, fmul/fadd instructions without contract flag are emitted.
	// In backend
	// nvptx/amdgcn - assumes standard fp fuse option, which only
	// fuses mult/add insts with contract flag or
	// llvm.fmuladd intrinsics.

	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -ffp-contract=off -disable-llvm-passes -o - %s \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-OFF %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
	// RUN: -ffp-contract=off \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OFF %s
	// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
	// RUN: -O3 -o - %s \
	// RUN: -ffp-contract=off \
	// RUN: \| FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s
	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \
	// RUN: -ffp-contract=off \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s

	// Check separate compile/backend steps corresponding to -save-temps.

	// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
	// RUN: -ffp-contract=off \
	// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
	// RUN: cat %t.ll \| FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s
	// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
	// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \
	// RUN: \| FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s

	#include "Inputs/cuda.h"

	// Test multiply/add in the same statement, which can be emitted as FMA when
	// fp-contract is on or fast.
	__host__ __device__ float func(float a, float b, float c) { return a + b * c; }
	// COMMON-LABEL: _Z4funcfff
	// NV-ON: fma.rn.f32
	// NV-ON-NEXT: st.param.f32
	// AMD-ON: v_fmac_f32_e64
	// AMD-ON-NEXT: s_setpc_b64

	// NV-OFF: mul.rn.f32
	// NV-OFF-NEXT: add.rn.f32
	// NV-OFF-NEXT: st.param.f32
	// AMD-OFF: v_mul_f32_e64
	// AMD-OFF-NEXT: v_add_f32_e64
	// AMD-OFF-NEXT: s_setpc_b64

	// NV-OPT-FAST: fma.rn.f32
	// NV-OPT-FAST-NEXT: st.param.f32
	// NV-OPT-FASTSTD: fma.rn.f32
	// NV-OPT-FASTSTD-NEXT: st.param.f32
	// NV-OPT-ON: fma.rn.f32
	// NV-OPT-ON-NEXT: st.param.f32
	// NV-OPT-OFF: mul.rn.f32
	// NV-OPT-OFF-NEXT: add.rn.f32
	// NV-OPT-OFF-NEXT: st.param.f32

	// AMD-OPT-FAST-IR: fmul contract float
	// AMD-OPT-FAST-IR: fadd contract float
	// AMD-OPT-ON-IR: @llvm.fmuladd.f32
	// AMD-OPT-OFF-IR: fmul float
	// AMD-OPT-OFF-IR: fadd float

	// AMD-OPT-FAST: v_fmac_f32_e32
	// AMD-OPT-FAST-NEXT: s_setpc_b64
	// AMD-OPT-FASTSTD: v_fmac_f32_e32
	// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
	// AMD-OPT-ON: v_fmac_f32_e32
	// AMD-OPT-ON-NEXT: s_setpc_b64
	// AMD-OPT-OFF: v_mul_f32_e32
	// AMD-OPT-OFF-NEXT: v_add_f32_e32
	// AMD-OPT-OFF-NEXT: s_setpc_b64

	// Test multiply/add in the different statements, which can be emitted as
	// FMA when fp-contract is fast but not on.
	__host__ __device__ float func2(float a, float b, float c) {
	float t = b * c;
	return t + a;
	}
	// COMMON-LABEL: _Z5func2fff
	// NV-OPT-FAST: fma.rn.f32
	// NV-OPT-FAST-NEXT: st.param.f32
	// NV-OPT-FASTSTD: fma.rn.f32
	// NV-OPT-FASTSTD-NEXT: st.param.f32
	// NV-OPT-ON: mul.rn.f32
	// NV-OPT-ON: add.rn.f32
	// NV-OPT-ON-NEXT: st.param.f32
	// NV-OPT-OFF: mul.rn.f32
	// NV-OPT-OFF: add.rn.f32
	// NV-OPT-OFF-NEXT: st.param.f32

	// AMD-OPT-FAST-IR: fmul contract float
	// AMD-OPT-FAST-IR: fadd contract float
	// AMD-OPT-ON-IR: fmul float
	// AMD-OPT-ON-IR: fadd float
	// AMD-OPT-OFF-IR: fmul float
	// AMD-OPT-OFF-IR: fadd float

	// AMD-OPT-FAST: v_fmac_f32_e32
	// AMD-OPT-FAST-NEXT: s_setpc_b64
	// AMD-OPT-FASTSTD: v_fmac_f32_e32
	// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
	// AMD-OPT-ON: v_mul_f32_e32
	// AMD-OPT-ON-NEXT: v_add_f32_e32
	// AMD-OPT-ON-NEXT: s_setpc_b64
	// AMD-OPT-OFF: v_mul_f32_e32
	// AMD-OPT-OFF-NEXT: v_add_f32_e32
	// AMD-OPT-OFF-NEXT: s_setpc_b64

	// Test multiply/add in the different statements, which is forced
	// to be compiled with fp contract on. fmul/fadd without contract
	// flags are emitted in IR. In nvptx, they are emitted as FMA in
	// fp-contract is fast but not on, as nvptx backend uses the same
	// fp fuse option as front end, whereas fast fp fuse option in
	// backend fuses fadd/fmul disregarding contract flag. In amdgcn
	// they are not fused as amdgcn always use standard fp fusion
	// option which respects contract flag.
	__host__ __device__ float func3(float a, float b, float c) {
	#pragma clang fp contract(on)
	float t = b * c;
	return t + a;
	}
	// COMMON-LABEL: _Z5func3fff
	// NV-OPT-FAST: fma.rn.f32
	// NV-OPT-FAST-NEXT: st.param.f32
	// NV-OPT-FASTSTD: mul.rn.f32
	// NV-OPT-FASTSTD: add.rn.f32
	// NV-OPT-FASTSTD-NEXT: st.param.f32
	// NV-OPT-ON: mul.rn.f32
	// NV-OPT-ON: add.rn.f32
	// NV-OPT-ON-NEXT: st.param.f32
	// NV-OPT-OFF: mul.rn.f32
	// NV-OPT-OFF: add.rn.f32
	// NV-OPT-OFF-NEXT: st.param.f32

	// AMD-OPT-FAST-IR: fmul float
	// AMD-OPT-FAST-IR: fadd float
	// AMD-OPT-ON-IR: fmul float
	// AMD-OPT-ON-IR: fadd float
	// AMD-OPT-OFF-IR: fmul float
	// AMD-OPT-OFF-IR: fadd float

	// AMD-OPT-FAST: v_fmac_f32_e32
	// AMD-OPT-FAST-NEXT: s_setpc_b64
	// AMD-OPT-FASTSTD: v_mul_f32_e32
	// AMD-OPT-FASTSTD-NEXT: v_add_f32_e32
	// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
	// AMD-OPT-ON: v_mul_f32_e32
	// AMD-OPT-ON-NEXT: v_add_f32_e32
	// AMD-OPT-ON-NEXT: s_setpc_b64
	// AMD-OPT-OFF: v_mul_f32_e32
	// AMD-OPT-OFF-NEXT: v_add_f32_e32
	// AMD-OPT-OFF-NEXT: s_setpc_b64