; blob: dc0ec0ff7bb0b07fa89ba45486b782384b66baeb [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %}
target triple = "nvptx64-unknown-cuda"
;; FAST-LABEL: @t0
;; DEFAULT-LABEL: @t0
;; t0: a <2 x float> multiply whose result feeds an add, with no fast-math
;; flags on either instruction. The expected PTX differs between the two
;; FileCheck runs: with -fp-contract=fast a single fma.rn.f32x2 is expected;
;; without it a separate mul.rn.f32x2 and add.rn.f32x2 are expected.
define <2 x float> @t0(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
; FAST-LABEL: t0(
; FAST: {
; FAST-NEXT: .reg .b64 %rd<5>;
; FAST-EMPTY:
; FAST-NEXT: // %bb.0:
; FAST-NEXT: ld.param.b64 %rd1, [t0_param_0];
; FAST-NEXT: ld.param.b64 %rd2, [t0_param_1];
; FAST-NEXT: ld.param.b64 %rd3, [t0_param_2];
; FAST-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
; FAST-NEXT: st.param.b64 [func_retval0], %rd4;
; FAST-NEXT: ret;
;
; DEFAULT-LABEL: t0(
; DEFAULT: {
; DEFAULT-NEXT: .reg .b64 %rd<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT: // %bb.0:
; DEFAULT-NEXT: ld.param.b64 %rd1, [t0_param_0];
; DEFAULT-NEXT: ld.param.b64 %rd2, [t0_param_1];
; DEFAULT-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2;
; DEFAULT-NEXT: ld.param.b64 %rd4, [t0_param_2];
; DEFAULT-NEXT: add.rn.f32x2 %rd5, %rd3, %rd4;
; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5;
; DEFAULT-NEXT: ret;
;; %v0 = a * b
%v0 = fmul <2 x float> %a, %b
;; %v1 = (a * b) + c; the product's only use is this add.
%v1 = fadd <2 x float> %v0, %c
ret <2 x float> %v1
}
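;; We cannot form an fma here. Without -fp-contract=fast, make sure we
;; explicitly emit the ".rn" forms (add.rn.f32x2, sub.rn.f32x2, mul.rn.f32x2)
;; to prevent ptxas from fusing them with anything else. With
;; -fp-contract=fast the forms without ".rn" are expected instead.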
;; t1: computes (a + b) * (a - b) on <2 x float> with no fast-math flags.
;; The multiply consumes the add and the sub; no multiply result feeds an add,
;; so neither run expects an fma. The runs differ only in the rounding
;; suffix: no ".rn" with -fp-contract=fast, ".rn" on all three ops without it.
define <2 x float> @t1(<2 x float> %a, <2 x float> %b) {
; FAST-LABEL: t1(
; FAST: {
; FAST-NEXT: .reg .b64 %rd<6>;
; FAST-EMPTY:
; FAST-NEXT: // %bb.0:
; FAST-NEXT: ld.param.b64 %rd1, [t1_param_0];
; FAST-NEXT: ld.param.b64 %rd2, [t1_param_1];
; FAST-NEXT: add.f32x2 %rd3, %rd1, %rd2;
; FAST-NEXT: sub.f32x2 %rd4, %rd1, %rd2;
; FAST-NEXT: mul.f32x2 %rd5, %rd3, %rd4;
; FAST-NEXT: st.param.b64 [func_retval0], %rd5;
; FAST-NEXT: ret;
;
; DEFAULT-LABEL: t1(
; DEFAULT: {
; DEFAULT-NEXT: .reg .b64 %rd<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT: // %bb.0:
; DEFAULT-NEXT: ld.param.b64 %rd1, [t1_param_0];
; DEFAULT-NEXT: ld.param.b64 %rd2, [t1_param_1];
; DEFAULT-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
; DEFAULT-NEXT: sub.rn.f32x2 %rd4, %rd1, %rd2;
; DEFAULT-NEXT: mul.rn.f32x2 %rd5, %rd3, %rd4;
; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5;
; DEFAULT-NEXT: ret;
;; %v1 = a + b
%v1 = fadd <2 x float> %a, %b
;; %v2 = a - b
%v2 = fsub <2 x float> %a, %b
;; %v3 = (a + b) * (a - b)
%v3 = fmul <2 x float> %v1, %v2
ret <2 x float> %v3
}
;; Make sure we generate the non ".rn" version when the "contract" flag is
;; present on the instructions
;; t2: the same (a + b) * (a - b) computation as t1, but every instruction
;; carries the "contract" fast-math flag. Both runs share one set of
;; expectations: add.f32x2, sub.f32x2 and mul.f32x2, all without ".rn".
define <2 x float> @t2(<2 x float> %a, <2 x float> %b) {
; CHECK-LABEL: t2(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [t2_param_1];
; CHECK-NEXT: add.f32x2 %rd3, %rd1, %rd2;
; CHECK-NEXT: sub.f32x2 %rd4, %rd1, %rd2;
; CHECK-NEXT: mul.f32x2 %rd5, %rd3, %rd4;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd5;
; CHECK-NEXT: ret;
;; %v1 = a + b (contract)
%v1 = fadd contract <2 x float> %a, %b
;; %v2 = a - b (contract)
%v2 = fsub contract <2 x float> %a, %b
;; %v3 = (a + b) * (a - b) (contract)
%v3 = fmul contract <2 x float> %v1, %v2
ret <2 x float> %v3
}
;; Make sure we always fold to fma when the "contract" flag is present
;; t3: the same multiply-then-add shape as t0, but both instructions carry the
;; "contract" fast-math flag. Both runs share one set of expectations: a single
;; fma.rn.f32x2, whether or not -fp-contract=fast is passed.
define <2 x float> @t3(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
; CHECK-LABEL: t3(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [t3_param_1];
; CHECK-NEXT: ld.param.b64 %rd3, [t3_param_2];
; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-NEXT: ret;
;; %v0 = a * b (contract)
%v0 = fmul contract <2 x float> %a, %b
;; %v1 = (a * b) + c (contract); the product's only use is this add.
%v1 = fadd contract <2 x float> %v0, %c
ret <2 x float> %v1
}