| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ |
| ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM90A %s |
| ; RUN: %if ptxas-12.7 %{ \ |
| ; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ |
| ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_90a \ |
| ; RUN: %} |
| ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ |
| ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM100 %s |
| ; RUN: %if ptxas-12.7 %{ \ |
| ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ |
| ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ |
| ; RUN: %} |
| |
| ; Test that v2i32 -> v2f32 conversions don't emit bitwise operations on i64. |
| ; Also covers trunc, zext, and sext between <2 x i8> and <2 x i32>. |
| |
| target triple = "nvptx64-nvidia-cuda" |
| |
| ; External function returning <2 x i32>; @store_i32x2 calls it to obtain its |
| ; vector input. |
| declare <2 x i32> @return_i32x2(i32 %0) |
| |
| ; Test with v2i32: the <2 x i32> result of a call is bitcast to <2 x float>, |
| ; added to itself with fadd, and stored to %p. |
| ; The sm_90a expectations load the return value as two 32-bit registers and |
| ; use two scalar add.rn.f32 followed by st.v2.b32. The sm_100 expectations |
| ; load it as one 64-bit register and use a single add.rn.f32x2 followed by |
| ; st.b64. |
| define ptx_kernel void @store_i32x2(i32 %0, ptr %p) { |
| ; CHECK-SM90A-LABEL: store_i32x2( |
| ; CHECK-SM90A: { |
| ; CHECK-SM90A-NEXT: .reg .b32 %r<6>; |
| ; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-SM90A-EMPTY: |
| ; CHECK-SM90A-NEXT: // %bb.0: |
| ; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; |
| ; CHECK-SM90A-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; |
| ; CHECK-SM90A-NEXT: { // callseq 0, 0 |
| ; CHECK-SM90A-NEXT: .param .b32 param0; |
| ; CHECK-SM90A-NEXT: .param .align 8 .b8 retval0[8]; |
| ; CHECK-SM90A-NEXT: st.param.b32 [param0], %r1; |
| ; CHECK-SM90A-NEXT: call.uni (retval0), return_i32x2, (param0); |
| ; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0]; |
| ; CHECK-SM90A-NEXT: } // callseq 0 |
| ; CHECK-SM90A-NEXT: add.rn.f32 %r4, %r3, %r3; |
| ; CHECK-SM90A-NEXT: add.rn.f32 %r5, %r2, %r2; |
| ; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r5, %r4}; |
| ; CHECK-SM90A-NEXT: ret; |
| ; |
| ; CHECK-SM100-LABEL: store_i32x2( |
| ; CHECK-SM100: { |
| ; CHECK-SM100-NEXT: .reg .b32 %r<2>; |
| ; CHECK-SM100-NEXT: .reg .b64 %rd<4>; |
| ; CHECK-SM100-EMPTY: |
| ; CHECK-SM100-NEXT: // %bb.0: |
| ; CHECK-SM100-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; |
| ; CHECK-SM100-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; |
| ; CHECK-SM100-NEXT: { // callseq 0, 0 |
| ; CHECK-SM100-NEXT: .param .b32 param0; |
| ; CHECK-SM100-NEXT: .param .align 8 .b8 retval0[8]; |
| ; CHECK-SM100-NEXT: st.param.b32 [param0], %r1; |
| ; CHECK-SM100-NEXT: call.uni (retval0), return_i32x2, (param0); |
| ; CHECK-SM100-NEXT: ld.param.b64 %rd2, [retval0]; |
| ; CHECK-SM100-NEXT: } // callseq 0 |
| ; CHECK-SM100-NEXT: add.rn.f32x2 %rd3, %rd2, %rd2; |
| ; CHECK-SM100-NEXT: st.b64 [%rd1], %rd3; |
| ; CHECK-SM100-NEXT: ret; |
| %v = call <2 x i32> @return_i32x2(i32 %0) |
| %v.f32x2 = bitcast <2 x i32> %v to <2 x float> |
| %res = fadd <2 x float> %v.f32x2, %v.f32x2 |
| store <2 x float> %res, ptr %p, align 8 |
| ret void |
| } |
| |
| ; Test with inline ASM returning { <1 x float>, <1 x float> }, which decays to |
| ; v2i32. |
| ; The two asm results are concatenated into a <2 x float> with shufflevector, |
| ; multiplied by zeroinitializer, and stored to %p. |
| ; The sm_90a expectations multiply each 32-bit result with a scalar |
| ; mul.rn.f32. The sm_100 expectations first pack the two 32-bit results (and |
| ; the zero constant) into 64-bit registers with mov.b64, then use one |
| ; mul.rn.f32x2 and st.b64. |
| define ptx_kernel void @inlineasm(ptr %p) { |
| ; CHECK-SM90A-LABEL: inlineasm( |
| ; CHECK-SM90A: { |
| ; CHECK-SM90A-NEXT: .reg .b32 %r<7>; |
| ; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-SM90A-EMPTY: |
| ; CHECK-SM90A-NEXT: // %bb.0: |
| ; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [inlineasm_param_0]; |
| ; CHECK-SM90A-NEXT: mov.b32 %r3, 0; |
| ; CHECK-SM90A-NEXT: mov.b32 %r4, %r3; |
| ; CHECK-SM90A-NEXT: mov.b32 %r2, %r4; |
| ; CHECK-SM90A-NEXT: mov.b32 %r1, %r3; |
| ; CHECK-SM90A-NEXT: // begin inline asm |
| ; CHECK-SM90A-NEXT: // nop |
| ; CHECK-SM90A-NEXT: // end inline asm |
| ; CHECK-SM90A-NEXT: mul.rn.f32 %r5, %r2, 0f00000000; |
| ; CHECK-SM90A-NEXT: mul.rn.f32 %r6, %r1, 0f00000000; |
| ; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r6, %r5}; |
| ; CHECK-SM90A-NEXT: ret; |
| ; |
| ; CHECK-SM100-LABEL: inlineasm( |
| ; CHECK-SM100: { |
| ; CHECK-SM100-NEXT: .reg .b32 %r<6>; |
| ; CHECK-SM100-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-SM100-EMPTY: |
| ; CHECK-SM100-NEXT: // %bb.0: |
| ; CHECK-SM100-NEXT: ld.param.b64 %rd1, [inlineasm_param_0]; |
| ; CHECK-SM100-NEXT: mov.b32 %r3, 0; |
| ; CHECK-SM100-NEXT: mov.b32 %r4, %r3; |
| ; CHECK-SM100-NEXT: mov.b32 %r2, %r4; |
| ; CHECK-SM100-NEXT: mov.b32 %r1, %r3; |
| ; CHECK-SM100-NEXT: // begin inline asm |
| ; CHECK-SM100-NEXT: // nop |
| ; CHECK-SM100-NEXT: // end inline asm |
| ; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2}; |
| ; CHECK-SM100-NEXT: mov.b32 %r5, 0f00000000; |
| ; CHECK-SM100-NEXT: mov.b64 %rd3, {%r5, %r5}; |
| ; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd3; |
| ; CHECK-SM100-NEXT: st.b64 [%rd1], %rd4; |
| ; CHECK-SM100-NEXT: ret; |
| %r = call { <1 x float>, <1 x float> } asm sideeffect "// nop", "=f,=f,0,1"(<1 x float> zeroinitializer, <1 x float> zeroinitializer) |
| %i0 = extractvalue { <1 x float>, <1 x float> } %r, 0 |
| %i1 = extractvalue { <1 x float>, <1 x float> } %r, 1 |
| %i4 = shufflevector <1 x float> %i0, <1 x float> %i1, <2 x i32> <i32 0, i32 1> |
| %mul = fmul < 2 x float> %i4, zeroinitializer |
| store <2 x float> %mul, ptr %p, align 8 |
| ret void |
| } |
| |
| ; Test truncation from v2i32: the <2 x i32> kernel parameter is truncated to |
| ; <2 x i8>, placed in lanes 2 and 3 of a <4 x i8> whose lanes 0 and 1 are |
| ; zero, and stored to a null pointer. |
| ; Both sets of expectations assemble the 32-bit result with prmt.b32. The |
| ; sm_90a output loads the parameter as two 32-bit registers, while the sm_100 |
| ; output loads it as one 64-bit register and splits it with mov.b64. |
| define ptx_kernel void @trunc_v2i32(<2 x i32> %0) { |
| ; CHECK-SM90A-LABEL: trunc_v2i32( |
| ; CHECK-SM90A: { |
| ; CHECK-SM90A-NEXT: .reg .b32 %r<7>; |
| ; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-SM90A-EMPTY: |
| ; CHECK-SM90A-NEXT: // %bb.0: |
| ; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0]; |
| ; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U; |
| ; CHECK-SM90A-NEXT: mov.b32 %r4, 0; |
| ; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U; |
| ; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd1, 0; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6; |
| ; CHECK-SM90A-NEXT: ret; |
| ; |
| ; CHECK-SM100-LABEL: trunc_v2i32( |
| ; CHECK-SM100: { |
| ; CHECK-SM100-NEXT: .reg .b32 %r<7>; |
| ; CHECK-SM100-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-SM100-EMPTY: |
| ; CHECK-SM100-NEXT: // %bb.0: |
| ; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0]; |
| ; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1; |
| ; CHECK-SM100-NEXT: mov.b32 %r3, 0; |
| ; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; |
| ; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U; |
| ; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U; |
| ; CHECK-SM100-NEXT: mov.b64 %rd2, 0; |
| ; CHECK-SM100-NEXT: st.b32 [%rd2], %r6; |
| ; CHECK-SM100-NEXT: ret; |
| %2 = trunc <2 x i32> %0 to <2 x i8> |
| %3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| store <4 x i8> %3, ptr null, align 4 |
| ret void |
| } |
| |
| ; Test zero-extension to v2i32: the <2 x i8> kernel parameter is zero-extended |
| ; to <2 x i32>, placed in lanes 2 and 3 of a <4 x i32> whose lanes 0 and 1 are |
| ; zero, and stored to a null pointer. |
| ; Both sets of expectations emit four st.b32 to addresses 0, 4, 8 and 12. The |
| ; sm_90a output stores the two extended 32-bit registers directly. The sm_100 |
| ; output packs them into one 64-bit register with mov.b64, stores that |
| ; register at address 8, and stores it shifted right by 32 (shr.u64) at |
| ; address 12. |
| define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) { |
| ; CHECK-SM90A-LABEL: zextend_to_v2i32( |
| ; CHECK-SM90A: { |
| ; CHECK-SM90A-NEXT: .reg .b16 %rs<3>; |
| ; CHECK-SM90A-NEXT: .reg .b32 %r<4>; |
| ; CHECK-SM90A-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-SM90A-EMPTY: |
| ; CHECK-SM90A-NEXT: // %bb.0: |
| ; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0]; |
| ; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2}; |
| ; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1; |
| ; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd1, 12; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd2, 8; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd3, 4; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd3], 0; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd4, 0; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd4], 0; |
| ; CHECK-SM90A-NEXT: ret; |
| ; |
| ; CHECK-SM100-LABEL: zextend_to_v2i32( |
| ; CHECK-SM100: { |
| ; CHECK-SM100-NEXT: .reg .b16 %rs<3>; |
| ; CHECK-SM100-NEXT: .reg .b32 %r<5>; |
| ; CHECK-SM100-NEXT: .reg .b64 %rd<8>; |
| ; CHECK-SM100-EMPTY: |
| ; CHECK-SM100-NEXT: // %bb.0: |
| ; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0]; |
| ; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2}; |
| ; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2; |
| ; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1; |
| ; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2}; |
| ; CHECK-SM100-NEXT: mov.b32 %r4, 0; |
| ; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4}; |
| ; CHECK-SM100-NEXT: mov.b64 %rd3, 4; |
| ; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2; |
| ; CHECK-SM100-NEXT: mov.b64 %rd4, 0; |
| ; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2; |
| ; CHECK-SM100-NEXT: mov.b64 %rd5, 8; |
| ; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1; |
| ; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32; |
| ; CHECK-SM100-NEXT: mov.b64 %rd7, 12; |
| ; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6; |
| ; CHECK-SM100-NEXT: ret; |
| %2 = zext <2 x i8> %0 to <2 x i32> |
| %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| store <4 x i32> %3, ptr null, align 4 |
| ret void |
| } |
| |
| ; Test sign-extension to v2i32: the <2 x i8> kernel parameter is sign-extended |
| ; to <2 x i32>, placed in lanes 2 and 3 of a <4 x i32> whose lanes 0 and 1 are |
| ; zero, and stored to a null pointer. |
| ; Both sets of expectations extend each lane with cvt.u32.u16 followed by |
| ; cvt.s32.s8 and emit four st.b32 to addresses 0, 4, 8 and 12. The sm_90a |
| ; output stores the two extended 32-bit registers directly. The sm_100 output |
| ; packs them into one 64-bit register with mov.b64, stores that register at |
| ; address 8, and stores it shifted right by 32 (shr.u64) at address 12. |
| define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) { |
| ; CHECK-SM90A-LABEL: sextend_to_v2i32( |
| ; CHECK-SM90A: { |
| ; CHECK-SM90A-NEXT: .reg .b16 %rs<3>; |
| ; CHECK-SM90A-NEXT: .reg .b32 %r<6>; |
| ; CHECK-SM90A-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-SM90A-EMPTY: |
| ; CHECK-SM90A-NEXT: // %bb.0: |
| ; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0]; |
| ; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2}; |
| ; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1; |
| ; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2; |
| ; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd1, 12; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd2, 8; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd3, 4; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd3], 0; |
| ; CHECK-SM90A-NEXT: mov.b64 %rd4, 0; |
| ; CHECK-SM90A-NEXT: st.b32 [%rd4], 0; |
| ; CHECK-SM90A-NEXT: ret; |
| ; |
| ; CHECK-SM100-LABEL: sextend_to_v2i32( |
| ; CHECK-SM100: { |
| ; CHECK-SM100-NEXT: .reg .b16 %rs<3>; |
| ; CHECK-SM100-NEXT: .reg .b32 %r<7>; |
| ; CHECK-SM100-NEXT: .reg .b64 %rd<8>; |
| ; CHECK-SM100-EMPTY: |
| ; CHECK-SM100-NEXT: // %bb.0: |
| ; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0]; |
| ; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2}; |
| ; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2; |
| ; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2; |
| ; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1; |
| ; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4; |
| ; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3}; |
| ; CHECK-SM100-NEXT: mov.b32 %r6, 0; |
| ; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6}; |
| ; CHECK-SM100-NEXT: mov.b64 %rd3, 4; |
| ; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2; |
| ; CHECK-SM100-NEXT: mov.b64 %rd4, 0; |
| ; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2; |
| ; CHECK-SM100-NEXT: mov.b64 %rd5, 8; |
| ; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1; |
| ; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32; |
| ; CHECK-SM100-NEXT: mov.b64 %rd7, 12; |
| ; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6; |
| ; CHECK-SM100-NEXT: ret; |
| %2 = sext <2 x i8> %0 to <2 x i32> |
| %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| store <4 x i32> %3, ptr null, align 4 |
| ret void |
| } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; CHECK: {{.*}} |