llvm/test/CodeGen/NVPTX/fp-contract.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}

 target triple = "nvptx64-unknown-cuda"

 ;; Make sure we are generating proper instruction sequences for fused ops
 ;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
 ;; add.f32 otherwise.  Without an explicit rounding mode on add.f32, ptxas
 ;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
 ;; we do not form fma.rn at the PTX level and explicitly generate add.rn
 ;; for all adds to prevent ptxas from fusing the ops.
 define float @t0(float %a, float %b, float %c) {
 ; FAST-LABEL: t0(
 ; FAST:       {
 ; FAST-NEXT:    .reg .b32 %r<5>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
 ; FAST-NEXT:    ld.param.b32 %r1, [t0_param_0];
 ; FAST-NEXT:    ld.param.b32 %r2, [t0_param_1];
 ; FAST-NEXT:    ld.param.b32 %r3, [t0_param_2];
 ; FAST-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
 ; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
 ; FAST-NEXT:    ret;
 ;
 ; DEFAULT-LABEL: t0(
 ; DEFAULT:       {
 ; DEFAULT-NEXT:    .reg .b32 %r<6>;
 ; DEFAULT-EMPTY:
 ; DEFAULT-NEXT:  // %bb.0:
 ; DEFAULT-NEXT:    ld.param.b32 %r1, [t0_param_0];
 ; DEFAULT-NEXT:    ld.param.b32 %r2, [t0_param_1];
 ; DEFAULT-NEXT:    mul.rn.f32 %r3, %r1, %r2;
 ; DEFAULT-NEXT:    ld.param.b32 %r4, [t0_param_2];
 ; DEFAULT-NEXT:    add.rn.f32 %r5, %r3, %r4;
 ; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
 ; DEFAULT-NEXT:    ret;
   ; a * b + c with no fast-math flags on either instruction, so whether the
   ; pair becomes a single fma depends only on the -fp-contract option used
   ; on the RUN lines.
   %prod = fmul float %a, %b
   %sum = fadd float %prod, %c
   ret float %sum
 }

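 ;; We cannot form an fma here. Without -fp-contract=fast, make sure we
 ;; explicitly emit the .rn forms (add.rn.f32, sub.rn.f32, mul.rn.f32) to
 ;; prevent ptxas from fusing these with anything else. With
 ;; -fp-contract=fast the plain add.f32/sub.f32/mul.f32 forms are expected.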
 define float @t1(float %a, float %b) {
 ; FAST-LABEL: t1(
 ; FAST:       {
 ; FAST-NEXT:    .reg .b32 %r<6>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
 ; FAST-NEXT:    ld.param.b32 %r1, [t1_param_0];
 ; FAST-NEXT:    ld.param.b32 %r2, [t1_param_1];
 ; FAST-NEXT:    add.f32 %r3, %r1, %r2;
 ; FAST-NEXT:    sub.f32 %r4, %r1, %r2;
 ; FAST-NEXT:    mul.f32 %r5, %r3, %r4;
 ; FAST-NEXT:    st.param.b32 [func_retval0], %r5;
 ; FAST-NEXT:    ret;
 ;
 ; DEFAULT-LABEL: t1(
 ; DEFAULT:       {
 ; DEFAULT-NEXT:    .reg .b32 %r<6>;
 ; DEFAULT-EMPTY:
 ; DEFAULT-NEXT:  // %bb.0:
 ; DEFAULT-NEXT:    ld.param.b32 %r1, [t1_param_0];
 ; DEFAULT-NEXT:    ld.param.b32 %r2, [t1_param_1];
 ; DEFAULT-NEXT:    add.rn.f32 %r3, %r1, %r2;
 ; DEFAULT-NEXT:    sub.rn.f32 %r4, %r1, %r2;
 ; DEFAULT-NEXT:    mul.rn.f32 %r5, %r3, %r4;
 ; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
 ; DEFAULT-NEXT:    ret;
   ; (a + b) * (a - b): the multiply comes last and its result is returned
   ; directly, so no add consumes a product in this function.
   %sum = fadd float %a, %b
   %diff = fsub float %a, %b
   %prod = fmul float %sum, %diff
   ret float %prod
 }

 ;; Make sure we generate the non ".rn" version when the "contract" flag is
 ;; present on the instructions
 define float @t2(float %a, float %b) {
 ; CHECK-LABEL: t2(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [t2_param_0];
 ; CHECK-NEXT:    ld.param.b32 %r2, [t2_param_1];
 ; CHECK-NEXT:    add.f32 %r3, %r1, %r2;
 ; CHECK-NEXT:    sub.f32 %r4, %r1, %r2;
 ; CHECK-NEXT:    mul.f32 %r5, %r3, %r4;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
   ; (a + b) * (a - b) with the "contract" flag on every operation; the
   ; expected output is shared by both llc configurations on the RUN lines.
   %sum = fadd contract float %a, %b
   %diff = fsub contract float %a, %b
   %prod = fmul contract float %sum, %diff
   ret float %prod
 }

 ;; Make sure we always fold to fma when the "contract" flag is present
 define float @t3(float %a, float %b, float %c) {
 ; CHECK-LABEL: t3(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [t3_param_0];
 ; CHECK-NEXT:    ld.param.b32 %r2, [t3_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r3, [t3_param_2];
 ; CHECK-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   ; a * b + c where both the multiply and the add are marked "contract", so
   ; the same fma.rn is expected with and without -fp-contract=fast.
   %prod = fmul contract float %a, %b
   %sum = fadd contract float %prod, %c
   ret float %sum
 }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
	; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
	; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT
	; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %}
	; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}

	target triple = "nvptx64-unknown-cuda"

	;; Make sure we are generating proper instruction sequences for fused ops
	;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
	;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
	;; is free to fuse with a multiply if it is able. If fusion is not allowed,
	;; we do not form fma.rn at the PTX level and explicitly generate add.rn
	;; for all adds to prevent ptxas from fusing the ops.
	define float @t0(float %a, float %b, float %c) {
	; FAST-LABEL: t0(
	; FAST: {
	; FAST-NEXT: .reg .b32 %r<5>;
	; FAST-EMPTY:
	; FAST-NEXT: // %bb.0:
	; FAST-NEXT: ld.param.b32 %r1, [t0_param_0];
	; FAST-NEXT: ld.param.b32 %r2, [t0_param_1];
	; FAST-NEXT: ld.param.b32 %r3, [t0_param_2];
	; FAST-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3;
	; FAST-NEXT: st.param.b32 [func_retval0], %r4;
	; FAST-NEXT: ret;
	;
	; DEFAULT-LABEL: t0(
	; DEFAULT: {
	; DEFAULT-NEXT: .reg .b32 %r<6>;
	; DEFAULT-EMPTY:
	; DEFAULT-NEXT: // %bb.0:
	; DEFAULT-NEXT: ld.param.b32 %r1, [t0_param_0];
	; DEFAULT-NEXT: ld.param.b32 %r2, [t0_param_1];
	; DEFAULT-NEXT: mul.rn.f32 %r3, %r1, %r2;
	; DEFAULT-NEXT: ld.param.b32 %r4, [t0_param_2];
	; DEFAULT-NEXT: add.rn.f32 %r5, %r3, %r4;
	; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5;
	; DEFAULT-NEXT: ret;
	; a * b + c with no fast-math flags on either instruction, so whether the
	; pair becomes a single fma depends only on the -fp-contract option used
	; on the RUN lines.
	%prod = fmul float %a, %b
	%sum = fadd float %prod, %c
	ret float %sum
	}

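	;; We cannot form an fma here. Without -fp-contract=fast, make sure we
	;; explicitly emit the .rn forms (add.rn.f32, sub.rn.f32, mul.rn.f32) to
	;; prevent ptxas from fusing these with anything else. With
	;; -fp-contract=fast the plain add.f32/sub.f32/mul.f32 forms are expected.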
	define float @t1(float %a, float %b) {
	; FAST-LABEL: t1(
	; FAST: {
	; FAST-NEXT: .reg .b32 %r<6>;
	; FAST-EMPTY:
	; FAST-NEXT: // %bb.0:
	; FAST-NEXT: ld.param.b32 %r1, [t1_param_0];
	; FAST-NEXT: ld.param.b32 %r2, [t1_param_1];
	; FAST-NEXT: add.f32 %r3, %r1, %r2;
	; FAST-NEXT: sub.f32 %r4, %r1, %r2;
	; FAST-NEXT: mul.f32 %r5, %r3, %r4;
	; FAST-NEXT: st.param.b32 [func_retval0], %r5;
	; FAST-NEXT: ret;
	;
	; DEFAULT-LABEL: t1(
	; DEFAULT: {
	; DEFAULT-NEXT: .reg .b32 %r<6>;
	; DEFAULT-EMPTY:
	; DEFAULT-NEXT: // %bb.0:
	; DEFAULT-NEXT: ld.param.b32 %r1, [t1_param_0];
	; DEFAULT-NEXT: ld.param.b32 %r2, [t1_param_1];
	; DEFAULT-NEXT: add.rn.f32 %r3, %r1, %r2;
	; DEFAULT-NEXT: sub.rn.f32 %r4, %r1, %r2;
	; DEFAULT-NEXT: mul.rn.f32 %r5, %r3, %r4;
	; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5;
	; DEFAULT-NEXT: ret;
	; (a + b) * (a - b): the multiply comes last and its result is returned
	; directly, so no add consumes a product in this function.
	%sum = fadd float %a, %b
	%diff = fsub float %a, %b
	%prod = fmul float %sum, %diff
	ret float %prod
	}

	;; Make sure we generate the non ".rn" version when the "contract" flag is
	;; present on the instructions
	define float @t2(float %a, float %b) {
	; CHECK-LABEL: t2(
	; CHECK: {
	; CHECK-NEXT: .reg .b32 %r<6>;
	; CHECK-EMPTY:
	; CHECK-NEXT: // %bb.0:
	; CHECK-NEXT: ld.param.b32 %r1, [t2_param_0];
	; CHECK-NEXT: ld.param.b32 %r2, [t2_param_1];
	; CHECK-NEXT: add.f32 %r3, %r1, %r2;
	; CHECK-NEXT: sub.f32 %r4, %r1, %r2;
	; CHECK-NEXT: mul.f32 %r5, %r3, %r4;
	; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
	; CHECK-NEXT: ret;
	; (a + b) * (a - b) with the "contract" flag on every operation; the
	; expected output is shared by both llc configurations on the RUN lines.
	%sum = fadd contract float %a, %b
	%diff = fsub contract float %a, %b
	%prod = fmul contract float %sum, %diff
	ret float %prod
	}

	;; Make sure we always fold to fma when the "contract" flag is present
	define float @t3(float %a, float %b, float %c) {
	; CHECK-LABEL: t3(
	; CHECK: {
	; CHECK-NEXT: .reg .b32 %r<5>;
	; CHECK-EMPTY:
	; CHECK-NEXT: // %bb.0:
	; CHECK-NEXT: ld.param.b32 %r1, [t3_param_0];
	; CHECK-NEXT: ld.param.b32 %r2, [t3_param_1];
	; CHECK-NEXT: ld.param.b32 %r3, [t3_param_2];
	; CHECK-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3;
	; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
	; CHECK-NEXT: ret;
	; a * b + c where both the multiply and the add are marked "contract", so
	; the same fma.rn is expected with and without -fp-contract=fast.
	%prod = fmul contract float %a, %b
	%sum = fadd contract float %prod, %c
	ret float %sum
	}