; blob: 21ca041f6220a1961d8772ab67ddb03f6136d53f [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM90A %s
; RUN: %if ptxas-12.7 %{ \
; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_90a \
; RUN: %}
; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM100 %s
; RUN: %if ptxas-12.7 %{ \
; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
; RUN: %}
; Test that v2i32 -> v2f32 conversions don't emit bitwise operations on i64.
; Target triple for this module: 64-bit NVPTX for CUDA.
target triple = "nvptx64-nvidia-cuda"
; External callee used by @store_i32x2: takes an i32 and returns a <2 x i32>
; by value. Only the declaration is visible here, so the call cannot be
; folded away and the returned vector has to be materialised in registers.
declare <2 x i32> @return_i32x2(i32 %0)
; Test with v2i32.
; @store_i32x2 calls @return_i32x2, reinterprets the returned <2 x i32> as
; <2 x float> with a bitcast, adds the vector to itself and stores the sum
; through %p. The expected PTX below shows two scalar add.rn.f32 on sm_90a and
; a single packed add.rn.f32x2 on a 64-bit register on sm_100; neither sequence
; contains a bitwise operation on a 64-bit register.
define ptx_kernel void @store_i32x2(i32 %0, ptr %p) {
; CHECK-SM90A-LABEL: store_i32x2(
; CHECK-SM90A: {
; CHECK-SM90A-NEXT: .reg .b32 %r<6>;
; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
; CHECK-SM90A-EMPTY:
; CHECK-SM90A-NEXT: // %bb.0:
; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1];
; CHECK-SM90A-NEXT: ld.param.b32 %r1, [store_i32x2_param_0];
; CHECK-SM90A-NEXT: { // callseq 0, 0
; CHECK-SM90A-NEXT: .param .b32 param0;
; CHECK-SM90A-NEXT: .param .align 8 .b8 retval0[8];
; CHECK-SM90A-NEXT: st.param.b32 [param0], %r1;
; CHECK-SM90A-NEXT: call.uni (retval0), return_i32x2, (param0);
; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0];
; CHECK-SM90A-NEXT: } // callseq 0
; CHECK-SM90A-NEXT: add.rn.f32 %r4, %r3, %r3;
; CHECK-SM90A-NEXT: add.rn.f32 %r5, %r2, %r2;
; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r5, %r4};
; CHECK-SM90A-NEXT: ret;
;
; CHECK-SM100-LABEL: store_i32x2(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b32 %r<2>;
; CHECK-SM100-NEXT: .reg .b64 %rd<4>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1];
; CHECK-SM100-NEXT: ld.param.b32 %r1, [store_i32x2_param_0];
; CHECK-SM100-NEXT: { // callseq 0, 0
; CHECK-SM100-NEXT: .param .b32 param0;
; CHECK-SM100-NEXT: .param .align 8 .b8 retval0[8];
; CHECK-SM100-NEXT: st.param.b32 [param0], %r1;
; CHECK-SM100-NEXT: call.uni (retval0), return_i32x2, (param0);
; CHECK-SM100-NEXT: ld.param.b64 %rd2, [retval0];
; CHECK-SM100-NEXT: } // callseq 0
; CHECK-SM100-NEXT: add.rn.f32x2 %rd3, %rd2, %rd2;
; CHECK-SM100-NEXT: st.b64 [%rd1], %rd3;
; CHECK-SM100-NEXT: ret;
; Obtain a <2 x i32> whose contents the compiler cannot see.
%v = call <2 x i32> @return_i32x2(i32 %0)
; Same 64 bits viewed as two floats; this is the conversion under test.
%v.f32x2 = bitcast <2 x i32> %v to <2 x float>
; Lane-wise x + x on the reinterpreted vector.
%res = fadd <2 x float> %v.f32x2, %v.f32x2
; 8-byte aligned store of both lanes through %p.
store <2 x float> %res, ptr %p, align 8
ret void
}
; Test with inline ASM returning { <1 x float>, <1 x float> }, which decays to
; v2i32.
; The two asm results are concatenated into a <2 x float>, multiplied by a zero
; vector and stored through %p. The expected PTX below shows two scalar
; mul.rn.f32 against the immediate 0f00000000 on sm_90a; on sm_100 both the asm
; results and the zero constant are packed into 64-bit registers with mov.b64
; and multiplied by one mul.rn.f32x2.
define ptx_kernel void @inlineasm(ptr %p) {
; CHECK-SM90A-LABEL: inlineasm(
; CHECK-SM90A: {
; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
; CHECK-SM90A-EMPTY:
; CHECK-SM90A-NEXT: // %bb.0:
; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [inlineasm_param_0];
; CHECK-SM90A-NEXT: mov.b32 %r3, 0;
; CHECK-SM90A-NEXT: mov.b32 %r4, %r3;
; CHECK-SM90A-NEXT: mov.b32 %r2, %r4;
; CHECK-SM90A-NEXT: mov.b32 %r1, %r3;
; CHECK-SM90A-NEXT: // begin inline asm
; CHECK-SM90A-NEXT: // nop
; CHECK-SM90A-NEXT: // end inline asm
; CHECK-SM90A-NEXT: mul.rn.f32 %r5, %r2, 0f00000000;
; CHECK-SM90A-NEXT: mul.rn.f32 %r6, %r1, 0f00000000;
; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r6, %r5};
; CHECK-SM90A-NEXT: ret;
;
; CHECK-SM100-LABEL: inlineasm(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b32 %r<6>;
; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.b64 %rd1, [inlineasm_param_0];
; CHECK-SM100-NEXT: mov.b32 %r3, 0;
; CHECK-SM100-NEXT: mov.b32 %r4, %r3;
; CHECK-SM100-NEXT: mov.b32 %r2, %r4;
; CHECK-SM100-NEXT: mov.b32 %r1, %r3;
; CHECK-SM100-NEXT: // begin inline asm
; CHECK-SM100-NEXT: // nop
; CHECK-SM100-NEXT: // end inline asm
; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2};
; CHECK-SM100-NEXT: mov.b32 %r5, 0f00000000;
; CHECK-SM100-NEXT: mov.b64 %rd3, {%r5, %r5};
; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd3;
; CHECK-SM100-NEXT: st.b64 [%rd1], %rd4;
; CHECK-SM100-NEXT: ret;
; The asm body is only a PTX comment. Constraints "=f,=f,0,1" declare two float
; outputs and tie input 0 to output 0 and input 1 to output 1; both inputs are
; zero vectors. sideeffect keeps the asm statement in the output.
%r = call { <1 x float>, <1 x float> } asm sideeffect "// nop", "=f,=f,0,1"(<1 x float> zeroinitializer, <1 x float> zeroinitializer)
; Pull the two single-element results out of the returned struct.
%i0 = extractvalue { <1 x float>, <1 x float> } %r, 0
%i1 = extractvalue { <1 x float>, <1 x float> } %r, 1
; Mask <0, 1> concatenates them: lane 0 comes from %i0, lane 1 from %i1.
%i4 = shufflevector <1 x float> %i0, <1 x float> %i1, <2 x i32> <i32 0, i32 1>
; Lane-wise multiply by a zero vector.
%mul = fmul < 2 x float> %i4, zeroinitializer
; 8-byte aligned store of both lanes through %p.
store <2 x float> %mul, ptr %p, align 8
ret void
}
; @trunc_v2i32 truncates a <2 x i32> kernel argument to <2 x i8>, places the
; two bytes after two zero bytes in a <4 x i8>, and stores that vector to the
; null pointer. In the expected PTX below, sm_90a loads the argument as two
; 32-bit registers (ld.param.v2.b32), while sm_100 loads it as one 64-bit
; register and unpacks it with mov.b64; both then assemble the four bytes with
; three prmt.b32 instructions and emit one st.b32 to address 0.
define ptx_kernel void @trunc_v2i32(<2 x i32> %0) {
; CHECK-SM90A-LABEL: trunc_v2i32(
; CHECK-SM90A: {
; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
; CHECK-SM90A-EMPTY:
; CHECK-SM90A-NEXT: // %bb.0:
; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0];
; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U;
; CHECK-SM90A-NEXT: mov.b32 %r4, 0;
; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U;
; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
; CHECK-SM90A-NEXT: mov.b64 %rd1, 0;
; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6;
; CHECK-SM90A-NEXT: ret;
;
; CHECK-SM100-LABEL: trunc_v2i32(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b32 %r<7>;
; CHECK-SM100-NEXT: .reg .b64 %rd<3>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0];
; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1;
; CHECK-SM100-NEXT: mov.b32 %r3, 0;
; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U;
; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U;
; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U;
; CHECK-SM100-NEXT: mov.b64 %rd2, 0;
; CHECK-SM100-NEXT: st.b32 [%rd2], %r6;
; CHECK-SM100-NEXT: ret;
; Keep only the low 8 bits of each 32-bit lane.
%2 = trunc <2 x i32> %0 to <2 x i8>
; Mask <0, 1, 2, 3> concatenates the operands: lanes 0-1 are zero, lanes 2-3
; are the truncated values.
%3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; The destination is the null pointer; the test only inspects the generated
; code, which materialises address 0 in a register.
store <4 x i8> %3, ptr null, align 4
ret void
}
; @zextend_to_v2i32 zero-extends a <2 x i8> kernel argument to <2 x i32>,
; places the two lanes after two zero lanes in a <4 x i32>, and stores that
; vector to the null pointer. The expected PTX below widens each byte with
; cvt.u32.u16 and writes the vector as four st.b32 to addresses 0, 4, 8 and 12.
; On sm_90a the stored operands are 32-bit registers or the immediate 0. On
; sm_100 the lanes are first packed into 64-bit registers with mov.b64, the
; st.b32 instructions take those 64-bit registers as their source, and the
; upper lane is obtained with shr.u64 by 32.
define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) {
; CHECK-SM90A-LABEL: zextend_to_v2i32(
; CHECK-SM90A: {
; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
; CHECK-SM90A-NEXT: .reg .b32 %r<4>;
; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
; CHECK-SM90A-EMPTY:
; CHECK-SM90A-NEXT: // %bb.0:
; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2;
; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3;
; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2;
; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
; CHECK-SM90A-NEXT: ret;
;
; CHECK-SM100-LABEL: zextend_to_v2i32(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
; CHECK-SM100-NEXT: .reg .b32 %r<5>;
; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1;
; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2};
; CHECK-SM100-NEXT: mov.b32 %r4, 0;
; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4};
; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
; CHECK-SM100-NEXT: ret;
; Widen each 8-bit lane to 32 bits, filling the upper bits with zeros.
%2 = zext <2 x i8> %0 to <2 x i32>
; Mask <0, 1, 2, 3> concatenates the operands: lanes 0-1 are zero, lanes 2-3
; are the extended values.
%3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; The destination is the null pointer with only 4-byte alignment; the expected
; output stores the 16 bytes as four separate 32-bit stores.
store <4 x i32> %3, ptr null, align 4
ret void
}
; @sextend_to_v2i32 sign-extends a <2 x i8> kernel argument to <2 x i32>,
; places the two lanes after two zero lanes in a <4 x i32>, and stores that
; vector to the null pointer. The expected PTX below widens each byte with
; cvt.u32.u16 followed by cvt.s32.s8 and writes the vector as four st.b32 to
; addresses 0, 4, 8 and 12. On sm_90a the stored operands are 32-bit registers
; or the immediate 0. On sm_100 the lanes are first packed into 64-bit
; registers with mov.b64, the st.b32 instructions take those 64-bit registers
; as their source, and the upper lane is obtained with shr.u64 by 32.
define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) {
; CHECK-SM90A-LABEL: sextend_to_v2i32(
; CHECK-SM90A: {
; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
; CHECK-SM90A-NEXT: .reg .b32 %r<6>;
; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
; CHECK-SM90A-EMPTY:
; CHECK-SM90A-NEXT: // %bb.0:
; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2;
; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2;
; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4;
; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5;
; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3;
; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
; CHECK-SM90A-NEXT: ret;
;
; CHECK-SM100-LABEL: sextend_to_v2i32(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
; CHECK-SM100-NEXT: .reg .b32 %r<7>;
; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2;
; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1;
; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4;
; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3};
; CHECK-SM100-NEXT: mov.b32 %r6, 0;
; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6};
; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
; CHECK-SM100-NEXT: ret;
; Widen each 8-bit lane to 32 bits, replicating the sign bit.
%2 = sext <2 x i8> %0 to <2 x i32>
; Mask <0, 1, 2, 3> concatenates the operands: lanes 0-1 are zero, lanes 2-3
; are the extended values.
%3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; The destination is the null pointer with only 4-byte alignment; the expected
; output stores the 16 bytes as four separate 32-bit stores.
store <4 x i32> %3, ptr null, align 4
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}