| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT |
| ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %} |
| ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %} |
| |
| target triple = "nvptx64-unknown-cuda" |
| |
| ;; Make sure we are generating proper instruction sequences for fused ops |
| ;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit |
| ;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas |
| ;; is free to fuse with a multiply if it is able. If fusion is not allowed, |
| ;; we do not form fma.rn at the PTX level and explicitly generate add.rn |
| ;; for all adds to prevent ptxas from fusing the ops. |
| define float @t0(float %a, float %b, float %c) { |
| ; FAST-LABEL: t0( |
| ; FAST: { |
| ; FAST-NEXT: .reg .b32 %r<5>; |
| ; FAST-EMPTY: |
| ; FAST-NEXT: // %bb.0: |
| ; FAST-NEXT: ld.param.b32 %r1, [t0_param_0]; |
| ; FAST-NEXT: ld.param.b32 %r2, [t0_param_1]; |
| ; FAST-NEXT: ld.param.b32 %r3, [t0_param_2]; |
| ; FAST-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3; |
| ; FAST-NEXT: st.param.b32 [func_retval0], %r4; |
| ; FAST-NEXT: ret; |
| ; |
| ; DEFAULT-LABEL: t0( |
| ; DEFAULT: { |
| ; DEFAULT-NEXT: .reg .b32 %r<6>; |
| ; DEFAULT-EMPTY: |
| ; DEFAULT-NEXT: // %bb.0: |
| ; DEFAULT-NEXT: ld.param.b32 %r1, [t0_param_0]; |
| ; DEFAULT-NEXT: ld.param.b32 %r2, [t0_param_1]; |
| ; DEFAULT-NEXT: mul.rn.f32 %r3, %r1, %r2; |
| ; DEFAULT-NEXT: ld.param.b32 %r4, [t0_param_2]; |
| ; DEFAULT-NEXT: add.rn.f32 %r5, %r3, %r4; |
| ; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5; |
| ; DEFAULT-NEXT: ret; |
| ;; Computes (a * b) + c. Neither instruction carries the "contract" flag, so |
| ;; the expected output depends only on the llc command line: with |
| ;; -fp-contract=fast the checks above expect a single fma.rn.f32, while |
| ;; without it they expect a separate mul.rn.f32 followed by add.rn.f32. |
| %v0 = fmul float %a, %b |
| %v1 = fadd float %v0, %c |
| ret float %v1 |
| } |
| |
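| ;; We cannot form an fma here. With -fp-contract=fast we emit the plain |
| ;; add.f32/sub.f32/mul.f32 forms; otherwise make sure we explicitly emit |
| ;; add.rn.f32 (likewise sub.rn.f32 and mul.rn.f32) to prevent ptxas from |
| ;; fusing this with anything else. |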
| define float @t1(float %a, float %b) { |
| ; FAST-LABEL: t1( |
| ; FAST: { |
| ; FAST-NEXT: .reg .b32 %r<6>; |
| ; FAST-EMPTY: |
| ; FAST-NEXT: // %bb.0: |
| ; FAST-NEXT: ld.param.b32 %r1, [t1_param_0]; |
| ; FAST-NEXT: ld.param.b32 %r2, [t1_param_1]; |
| ; FAST-NEXT: add.f32 %r3, %r1, %r2; |
| ; FAST-NEXT: sub.f32 %r4, %r1, %r2; |
| ; FAST-NEXT: mul.f32 %r5, %r3, %r4; |
| ; FAST-NEXT: st.param.b32 [func_retval0], %r5; |
| ; FAST-NEXT: ret; |
| ; |
| ; DEFAULT-LABEL: t1( |
| ; DEFAULT: { |
| ; DEFAULT-NEXT: .reg .b32 %r<6>; |
| ; DEFAULT-EMPTY: |
| ; DEFAULT-NEXT: // %bb.0: |
| ; DEFAULT-NEXT: ld.param.b32 %r1, [t1_param_0]; |
| ; DEFAULT-NEXT: ld.param.b32 %r2, [t1_param_1]; |
| ; DEFAULT-NEXT: add.rn.f32 %r3, %r1, %r2; |
| ; DEFAULT-NEXT: sub.rn.f32 %r4, %r1, %r2; |
| ; DEFAULT-NEXT: mul.rn.f32 %r5, %r3, %r4; |
| ; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5; |
| ; DEFAULT-NEXT: ret; |
| ;; Computes (a + b) * (a - b). The multiply consumes the add and sub results |
| ;; rather than feeding one, so neither run expects an fma. What differs is |
| ;; the rounding suffix: the -fp-contract=fast checks expect the unsuffixed |
| ;; add.f32/sub.f32/mul.f32, the other run expects the explicit .rn forms. |
| %v1 = fadd float %a, %b |
| %v2 = fsub float %a, %b |
| %v3 = fmul float %v1, %v2 |
| ret float %v3 |
| } |
| |
| ;; Make sure we generate the non ".rn" version when the "contract" flag is |
| ;; present on the instructions |
| define float @t2(float %a, float %b) { |
| ; CHECK-LABEL: t2( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b32 %r1, [t2_param_0]; |
| ; CHECK-NEXT: ld.param.b32 %r2, [t2_param_1]; |
| ; CHECK-NEXT: add.f32 %r3, %r1, %r2; |
| ; CHECK-NEXT: sub.f32 %r4, %r1, %r2; |
| ; CHECK-NEXT: mul.f32 %r5, %r3, %r4; |
| ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; |
| ; CHECK-NEXT: ret; |
| ;; Same (a + b) * (a - b) computation as @t1, but every instruction carries |
| ;; the "contract" flag. The checks above use the prefix shared by both llc |
| ;; invocations, so the unsuffixed add.f32/sub.f32/mul.f32 forms are expected |
| ;; regardless of the -fp-contract setting. |
| %v1 = fadd contract float %a, %b |
| %v2 = fsub contract float %a, %b |
| %v3 = fmul contract float %v1, %v2 |
| ret float %v3 |
| } |
| |
| ;; Make sure we always fold to fma when the "contract" flag is present |
| define float @t3(float %a, float %b, float %c) { |
| ; CHECK-LABEL: t3( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b32 %r1, [t3_param_0]; |
| ; CHECK-NEXT: ld.param.b32 %r2, [t3_param_1]; |
| ; CHECK-NEXT: ld.param.b32 %r3, [t3_param_2]; |
| ; CHECK-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3; |
| ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; |
| ; CHECK-NEXT: ret; |
| ;; Same (a * b) + c pattern as @t0, but both the fmul and the fadd carry the |
| ;; "contract" flag. The checks above use the prefix shared by both llc |
| ;; invocations, so a single fma.rn.f32 is expected with or without |
| ;; -fp-contract=fast. |
| %v0 = fmul contract float %a, %b |
| %v1 = fadd contract float %v0, %c |
| ret float %v1 |
| } |