| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefixes=CHECK,DEFAULT |
| ; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} |
| ; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} |
| |
| ;; Checks fma contraction for <2 x float> (f32x2) operations on sm_100. |
| ;; When contraction is allowed, either through -fp-contract=fast or through |
| ;; the "contract" flag on the instructions, an fmul feeding an fadd is |
| ;; expected to become fma.rn.f32x2 and the remaining operations are expected |
| ;; without the ".rn" modifier. Otherwise the explicitly rounded ".rn" forms |
| ;; are expected. |
| target triple = "nvptx64-unknown-cuda" |
| |
| ;; An fmul feeding an fadd, with no fast-math flags on either instruction. |
| ;; The pair is fused into fma.rn.f32x2 only under -fp-contract=fast. |
| ;; Otherwise the explicitly rounded mul.rn.f32x2 and add.rn.f32x2 are |
| ;; expected, so that ptxas cannot fuse them on its own. |
| define <2 x float> @t0(<2 x float> %a, <2 x float> %b, <2 x float> %c) { |
| ; FAST-LABEL: t0( |
| ; FAST: { |
| ; FAST-NEXT: .reg .b64 %rd<5>; |
| ; FAST-EMPTY: |
| ; FAST-NEXT: // %bb.0: |
| ; FAST-NEXT: ld.param.b64 %rd1, [t0_param_0]; |
| ; FAST-NEXT: ld.param.b64 %rd2, [t0_param_1]; |
| ; FAST-NEXT: ld.param.b64 %rd3, [t0_param_2]; |
| ; FAST-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; |
| ; FAST-NEXT: st.param.b64 [func_retval0], %rd4; |
| ; FAST-NEXT: ret; |
| ; |
| ; DEFAULT-LABEL: t0( |
| ; DEFAULT: { |
| ; DEFAULT-NEXT: .reg .b64 %rd<6>; |
| ; DEFAULT-EMPTY: |
| ; DEFAULT-NEXT: // %bb.0: |
| ; DEFAULT-NEXT: ld.param.b64 %rd1, [t0_param_0]; |
| ; DEFAULT-NEXT: ld.param.b64 %rd2, [t0_param_1]; |
| ; DEFAULT-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; |
| ; DEFAULT-NEXT: ld.param.b64 %rd4, [t0_param_2]; |
| ; DEFAULT-NEXT: add.rn.f32x2 %rd5, %rd3, %rd4; |
| ; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; |
| ; DEFAULT-NEXT: ret; |
| %v0 = fmul <2 x float> %a, %b |
| %v1 = fadd <2 x float> %v0, %c |
| ret <2 x float> %v1 |
| } |
| |
| ;; We cannot form an fma here, because the fmul consumes the fadd and fsub |
| ;; results instead of feeding one of them. Without -fp-contract=fast, make |
| ;; sure we explicitly emit the rounded add.rn, sub.rn and mul.rn f32x2 forms |
| ;; to prevent ptxas from fusing them with anything else. With |
| ;; -fp-contract=fast the plain add, sub and mul f32x2 forms are expected. |
| define <2 x float> @t1(<2 x float> %a, <2 x float> %b) { |
| ; FAST-LABEL: t1( |
| ; FAST: { |
| ; FAST-NEXT: .reg .b64 %rd<6>; |
| ; FAST-EMPTY: |
| ; FAST-NEXT: // %bb.0: |
| ; FAST-NEXT: ld.param.b64 %rd1, [t1_param_0]; |
| ; FAST-NEXT: ld.param.b64 %rd2, [t1_param_1]; |
| ; FAST-NEXT: add.f32x2 %rd3, %rd1, %rd2; |
| ; FAST-NEXT: sub.f32x2 %rd4, %rd1, %rd2; |
| ; FAST-NEXT: mul.f32x2 %rd5, %rd3, %rd4; |
| ; FAST-NEXT: st.param.b64 [func_retval0], %rd5; |
| ; FAST-NEXT: ret; |
| ; |
| ; DEFAULT-LABEL: t1( |
| ; DEFAULT: { |
| ; DEFAULT-NEXT: .reg .b64 %rd<6>; |
| ; DEFAULT-EMPTY: |
| ; DEFAULT-NEXT: // %bb.0: |
| ; DEFAULT-NEXT: ld.param.b64 %rd1, [t1_param_0]; |
| ; DEFAULT-NEXT: ld.param.b64 %rd2, [t1_param_1]; |
| ; DEFAULT-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; |
| ; DEFAULT-NEXT: sub.rn.f32x2 %rd4, %rd1, %rd2; |
| ; DEFAULT-NEXT: mul.rn.f32x2 %rd5, %rd3, %rd4; |
| ; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; |
| ; DEFAULT-NEXT: ret; |
| %v1 = fadd <2 x float> %a, %b |
| %v2 = fsub <2 x float> %a, %b |
| %v3 = fmul <2 x float> %v1, %v2 |
| ret <2 x float> %v3 |
| } |
| |
| ;; Make sure we generate the non ".rn" version when the "contract" flag is |
| ;; present on the instructions. Every operation below carries the flag, so |
| ;; the same output is expected with and without -fp-contract=fast, and both |
| ;; run configurations share one set of assertions. No fma is expected because |
| ;; the fmul consumes the fadd and fsub results instead of feeding one of them. |
| define <2 x float> @t2(<2 x float> %a, <2 x float> %b) { |
| ; CHECK-LABEL: t2( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [t2_param_1]; |
| ; CHECK-NEXT: add.f32x2 %rd3, %rd1, %rd2; |
| ; CHECK-NEXT: sub.f32x2 %rd4, %rd1, %rd2; |
| ; CHECK-NEXT: mul.f32x2 %rd5, %rd3, %rd4; |
| ; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; |
| ; CHECK-NEXT: ret; |
| %v1 = fadd contract <2 x float> %a, %b |
| %v2 = fsub contract <2 x float> %a, %b |
| %v3 = fmul contract <2 x float> %v1, %v2 |
| ret <2 x float> %v3 |
| } |
| |
| ;; Make sure we always fold to fma when the "contract" flag is present. Both |
| ;; the fmul and the fadd that consumes it carry the flag, so a single |
| ;; fma.rn.f32x2 is expected with and without -fp-contract=fast. |
| define <2 x float> @t3(<2 x float> %a, <2 x float> %b, <2 x float> %c) { |
| ; CHECK-LABEL: t3( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0]; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [t3_param_1]; |
| ; CHECK-NEXT: ld.param.b64 %rd3, [t3_param_2]; |
| ; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; |
| ; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; |
| ; CHECK-NEXT: ret; |
| %v0 = fmul contract <2 x float> %a, %b |
| %v1 = fadd contract <2 x float> %v0, %c |
| ret <2 x float> %v1 |
| } |