blob: d2ba1395f4e62fb738d616630253f7fe25d9dea9 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
target triple = "nvptx64-unknown-cuda"
;; Make sure we are generating proper instruction sequences for fused ops
;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
;; is free to fuse with a multiply if it is able. If fusion is not allowed,
;; we do not form fma.rn at the PTX level and explicitly generate add.rn
;; for all adds to prevent ptxas from fusing the ops.
define float @t0(float %a, float %b, float %c) {
;; Computes (%a * %b) + %c with no fast-math flags on either instruction.
;; The two llc invocations differ only in -fp-contract=fast and are checked
;; separately. The FAST checks expect a single fma.rn.f32, while the DEFAULT
;; checks expect a mul.rn.f32 followed by an add.rn.f32.
; FAST-LABEL: t0(
; FAST: {
; FAST-NEXT: .reg .b32 %r<5>;
; FAST-EMPTY:
; FAST-NEXT: // %bb.0:
; FAST-NEXT: ld.param.b32 %r1, [t0_param_0];
; FAST-NEXT: ld.param.b32 %r2, [t0_param_1];
; FAST-NEXT: ld.param.b32 %r3, [t0_param_2];
; FAST-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3;
; FAST-NEXT: st.param.b32 [func_retval0], %r4;
; FAST-NEXT: ret;
;
; DEFAULT-LABEL: t0(
; DEFAULT: {
; DEFAULT-NEXT: .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT: // %bb.0:
; DEFAULT-NEXT: ld.param.b32 %r1, [t0_param_0];
; DEFAULT-NEXT: ld.param.b32 %r2, [t0_param_1];
; DEFAULT-NEXT: mul.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT: ld.param.b32 %r4, [t0_param_2];
; DEFAULT-NEXT: add.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT: ret;
;; %v0 is the product and %v1 adds %c to it. This mul-then-add pair is the
;; fusion candidate.
%v0 = fmul float %a, %b
%v1 = fadd float %v0, %c
ret float %v1
}
;; We cannot form an fma here. When fusion is not allowed, make sure we
;; explicitly emit add.rn.f32 to prevent ptxas from fusing this with anything
;; else. With -fp-contract=fast the ops are emitted without a rounding mode.
define float @t1(float %a, float %b) {
;; Computes (%a + %b) * (%a - %b) with no fast-math flags. The multiply
;; consumes the add and sub results rather than feeding an add, and neither
;; set of checks contains an fma. The FAST checks expect add.f32, sub.f32 and
;; mul.f32 without a rounding modifier, while the DEFAULT checks expect the
;; .rn form of each.
; FAST-LABEL: t1(
; FAST: {
; FAST-NEXT: .reg .b32 %r<6>;
; FAST-EMPTY:
; FAST-NEXT: // %bb.0:
; FAST-NEXT: ld.param.b32 %r1, [t1_param_0];
; FAST-NEXT: ld.param.b32 %r2, [t1_param_1];
; FAST-NEXT: add.f32 %r3, %r1, %r2;
; FAST-NEXT: sub.f32 %r4, %r1, %r2;
; FAST-NEXT: mul.f32 %r5, %r3, %r4;
; FAST-NEXT: st.param.b32 [func_retval0], %r5;
; FAST-NEXT: ret;
;
; DEFAULT-LABEL: t1(
; DEFAULT: {
; DEFAULT-NEXT: .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT: // %bb.0:
; DEFAULT-NEXT: ld.param.b32 %r1, [t1_param_0];
; DEFAULT-NEXT: ld.param.b32 %r2, [t1_param_1];
; DEFAULT-NEXT: add.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT: sub.rn.f32 %r4, %r1, %r2;
; DEFAULT-NEXT: mul.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT: ret;
;; %v1 is the sum, %v2 the difference, and %v3 their product.
%v1 = fadd float %a, %b
%v2 = fsub float %a, %b
%v3 = fmul float %v1, %v2
ret float %v3
}
;; Make sure we generate the non ".rn" version when the "contract" flag is
;; present on the instructions
define float @t2(float %a, float %b) {
;; Computes (%a + %b) * (%a - %b) with the contract flag on every
;; floating-point instruction. Both llc invocations are verified through the
;; shared CHECK prefix, so with or without -fp-contract=fast the expected
;; output is add.f32, sub.f32 and mul.f32 with no rounding modifier.
; CHECK-LABEL: t2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [t2_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [t2_param_1];
; CHECK-NEXT: add.f32 %r3, %r1, %r2;
; CHECK-NEXT: sub.f32 %r4, %r1, %r2;
; CHECK-NEXT: mul.f32 %r5, %r3, %r4;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
;; %v1 is the sum, %v2 the difference, and %v3 their product. All three
;; instructions carry the contract flag.
%v1 = fadd contract float %a, %b
%v2 = fsub contract float %a, %b
%v3 = fmul contract float %v1, %v2
ret float %v3
}
;; Make sure we always fold to fma when the "contract" flag is present
define float @t3(float %a, float %b, float %c) {
;; Computes (%a * %b) + %c with the contract flag on both the multiply and
;; the add. Both llc invocations are verified through the shared CHECK
;; prefix, so a single fma.rn.f32 is expected with or without
;; -fp-contract=fast.
; CHECK-LABEL: t3(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [t3_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [t3_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [t3_param_2];
; CHECK-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3;
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
;; %v0 is the product and %v1 adds %c to it. Both instructions carry the
;; contract flag.
%v0 = fmul contract float %a, %b
%v1 = fadd contract float %v0, %c
ret float %v1
}