| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32 |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64 |
| ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} |
| ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} |
| |
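| ; Ensure that allocas are placed in the function's .local stack depot and are |
| ; accessed through %SPL (local address) or %SP (generic address) as expected. |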
| |
| ; @foo: non-kernel function that writes its i32 argument to a single stack |
| ; slot using a volatile store. The expected output declares a 4-byte |
| ; __local_depot0, initialises %SPL from it, and emits the store as st.local |
| ; through an address computed from %SPL. Unlike @foo2, no cvta.local into %SP |
| ; appears in the expected output for this function. |
| define void @foo(i32 %a) { |
| ; PTX32-LABEL: foo( |
| ; PTX32: { |
| ; PTX32-NEXT: .local .align 4 .b8 __local_depot0[4]; |
| ; PTX32-NEXT: .reg .b32 %SP; |
| ; PTX32-NEXT: .reg .b32 %SPL; |
| ; PTX32-NEXT: .reg .b32 %r<4>; |
| ; PTX32-EMPTY: |
| ; PTX32-NEXT: // %bb.0: |
| ; PTX32-NEXT: mov.b32 %SPL, __local_depot0; |
| ; PTX32-NEXT: ld.param.b32 %r1, [foo_param_0]; |
| ; PTX32-NEXT: add.u32 %r3, %SPL, 0; |
| ; PTX32-NEXT: st.local.b32 [%r3], %r1; |
| ; PTX32-NEXT: ret; |
| ; |
| ; PTX64-LABEL: foo( |
| ; PTX64: { |
| ; PTX64-NEXT: .local .align 4 .b8 __local_depot0[4]; |
| ; PTX64-NEXT: .reg .b64 %SP; |
| ; PTX64-NEXT: .reg .b64 %SPL; |
| ; PTX64-NEXT: .reg .b32 %r<2>; |
| ; PTX64-NEXT: .reg .b64 %rd<3>; |
| ; PTX64-EMPTY: |
| ; PTX64-NEXT: // %bb.0: |
| ; PTX64-NEXT: mov.b64 %SPL, __local_depot0; |
| ; PTX64-NEXT: ld.param.b32 %r1, [foo_param_0]; |
| ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; |
| ; PTX64-NEXT: st.local.b32 [%rd2], %r1; |
| ; PTX64-NEXT: ret; |
| ; One i32 slot with 4-byte alignment; corresponds to the 4-byte, .align 4 |
| ; depot in the expected output. |
| %local = alloca i32, align 4 |
| ; Volatile store of the argument into the slot (checked as st.local.b32). |
| store volatile i32 %a, ptr %local |
| ret void |
| } |
| |
| ; @foo2: ptx_kernel entry that stores its i32 argument to a stack slot and then |
| ; passes the slot's address to @bar. The expected output sets %SPL from |
| ; __local_depot1 and derives %SP from it with cvta.local. The store goes through |
| ; the %SPL-based address as st.local, while the pointer handed to @bar is the |
| ; %SP-based address. |
| define ptx_kernel void @foo2(i32 %a) { |
| ; PTX32-LABEL: foo2( |
| ; PTX32: { |
| ; PTX32-NEXT: .local .align 4 .b8 __local_depot1[4]; |
| ; PTX32-NEXT: .reg .b32 %SP; |
| ; PTX32-NEXT: .reg .b32 %SPL; |
| ; PTX32-NEXT: .reg .b32 %r<4>; |
| ; PTX32-EMPTY: |
| ; PTX32-NEXT: // %bb.0: |
| ; PTX32-NEXT: mov.b32 %SPL, __local_depot1; |
| ; PTX32-NEXT: cvta.local.u32 %SP, %SPL; |
| ; PTX32-NEXT: ld.param.b32 %r1, [foo2_param_0]; |
| ; PTX32-NEXT: add.u32 %r2, %SP, 0; |
| ; PTX32-NEXT: add.u32 %r3, %SPL, 0; |
| ; PTX32-NEXT: st.local.b32 [%r3], %r1; |
| ; PTX32-NEXT: { // callseq 0, 0 |
| ; PTX32-NEXT: .param .b32 param0; |
| ; PTX32-NEXT: st.param.b32 [param0], %r2; |
| ; PTX32-NEXT: call.uni bar, (param0); |
| ; PTX32-NEXT: } // callseq 0 |
| ; PTX32-NEXT: ret; |
| ; |
| ; PTX64-LABEL: foo2( |
| ; PTX64: { |
| ; PTX64-NEXT: .local .align 4 .b8 __local_depot1[4]; |
| ; PTX64-NEXT: .reg .b64 %SP; |
| ; PTX64-NEXT: .reg .b64 %SPL; |
| ; PTX64-NEXT: .reg .b32 %r<2>; |
| ; PTX64-NEXT: .reg .b64 %rd<3>; |
| ; PTX64-EMPTY: |
| ; PTX64-NEXT: // %bb.0: |
| ; PTX64-NEXT: mov.b64 %SPL, __local_depot1; |
| ; PTX64-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX64-NEXT: ld.param.b32 %r1, [foo2_param_0]; |
| ; PTX64-NEXT: add.u64 %rd1, %SP, 0; |
| ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; |
| ; PTX64-NEXT: st.local.b32 [%rd2], %r1; |
| ; PTX64-NEXT: { // callseq 0, 0 |
| ; PTX64-NEXT: .param .b64 param0; |
| ; PTX64-NEXT: st.param.b64 [param0], %rd1; |
| ; PTX64-NEXT: call.uni bar, (param0); |
| ; PTX64-NEXT: } // callseq 0 |
| ; PTX64-NEXT: ret; |
| ; One i32 slot with 4-byte alignment (4-byte depot in the expected output). |
| %local = alloca i32, align 4 |
| ; Plain (non-volatile) store; the slot's address is passed to @bar below. |
| store i32 %a, ptr %local |
| ; The slot's address is the only call argument. |
| call void @bar(ptr %local) |
| ret void |
| } |
| |
| ; External callee taking one pointer; @foo2 and @foo4 pass stack-slot addresses |
| ; to it. Only the declaration is present in this test. |
| declare void @bar(ptr %a) |
| |
| ; @foo3: non-kernel function that stores %a into a [3 x i32] stack array at a |
| ; run-time index that is also %a. The expected output declares a 12-byte |
| ; __local_depot2, scales the index by 4 (shl by 2 on the 32-bit target, |
| ; mul.wide.s32 by 4 on the 64-bit target), adds it to the %SPL-based address, |
| ; and emits the store as st.local. |
| define void @foo3(i32 %a) { |
| ; PTX32-LABEL: foo3( |
| ; PTX32: { |
| ; PTX32-NEXT: .local .align 4 .b8 __local_depot2[12]; |
| ; PTX32-NEXT: .reg .b32 %SP; |
| ; PTX32-NEXT: .reg .b32 %SPL; |
| ; PTX32-NEXT: .reg .b32 %r<6>; |
| ; PTX32-EMPTY: |
| ; PTX32-NEXT: // %bb.0: |
| ; PTX32-NEXT: mov.b32 %SPL, __local_depot2; |
| ; PTX32-NEXT: ld.param.b32 %r1, [foo3_param_0]; |
| ; PTX32-NEXT: add.u32 %r3, %SPL, 0; |
| ; PTX32-NEXT: shl.b32 %r4, %r1, 2; |
| ; PTX32-NEXT: add.s32 %r5, %r3, %r4; |
| ; PTX32-NEXT: st.local.b32 [%r5], %r1; |
| ; PTX32-NEXT: ret; |
| ; |
| ; PTX64-LABEL: foo3( |
| ; PTX64: { |
| ; PTX64-NEXT: .local .align 4 .b8 __local_depot2[12]; |
| ; PTX64-NEXT: .reg .b64 %SP; |
| ; PTX64-NEXT: .reg .b64 %SPL; |
| ; PTX64-NEXT: .reg .b32 %r<2>; |
| ; PTX64-NEXT: .reg .b64 %rd<5>; |
| ; PTX64-EMPTY: |
| ; PTX64-NEXT: // %bb.0: |
| ; PTX64-NEXT: mov.b64 %SPL, __local_depot2; |
| ; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0]; |
| ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; |
| ; PTX64-NEXT: mul.wide.s32 %rd3, %r1, 4; |
| ; PTX64-NEXT: add.s64 %rd4, %rd2, %rd3; |
| ; PTX64-NEXT: st.local.b32 [%rd4], %r1; |
| ; PTX64-NEXT: ret; |
| ; Three i32 elements = 12 bytes, matching the depot size expected above. |
| %local = alloca [3 x i32], align 4 |
| ; Element address computed from the run-time index %a (i32 elements). |
| %1 = getelementptr inbounds i32, ptr %local, i32 %a |
| ; Non-volatile store of %a into the selected element. |
| store i32 %a, ptr %1 |
| ret void |
| } |
| |
| ; @foo4: non-kernel function with two i32 stack slots. Each slot is zeroed and |
| ; its address is then passed to @bar in a separate call. The expected output |
| ; declares an 8-byte __local_depot3 and forms both %SP-based and %SPL-based |
| ; addresses at offsets 0 and 4. The zero stores use the %SPL-based addresses |
| ; as st.local; the first call receives the %SP-based address at offset 0 and |
| ; the second call the one at offset 4. |
| define void @foo4() { |
| ; PTX32-LABEL: foo4( |
| ; PTX32: { |
| ; PTX32-NEXT: .local .align 4 .b8 __local_depot3[8]; |
| ; PTX32-NEXT: .reg .b32 %SP; |
| ; PTX32-NEXT: .reg .b32 %SPL; |
| ; PTX32-NEXT: .reg .b32 %r<5>; |
| ; PTX32-EMPTY: |
| ; PTX32-NEXT: // %bb.0: |
| ; PTX32-NEXT: mov.b32 %SPL, __local_depot3; |
| ; PTX32-NEXT: cvta.local.u32 %SP, %SPL; |
| ; PTX32-NEXT: add.u32 %r1, %SP, 0; |
| ; PTX32-NEXT: add.u32 %r2, %SPL, 0; |
| ; PTX32-NEXT: add.u32 %r3, %SP, 4; |
| ; PTX32-NEXT: add.u32 %r4, %SPL, 4; |
| ; PTX32-NEXT: st.local.b32 [%r2], 0; |
| ; PTX32-NEXT: st.local.b32 [%r4], 0; |
| ; PTX32-NEXT: { // callseq 1, 0 |
| ; PTX32-NEXT: .param .b32 param0; |
| ; PTX32-NEXT: st.param.b32 [param0], %r1; |
| ; PTX32-NEXT: call.uni bar, (param0); |
| ; PTX32-NEXT: } // callseq 1 |
| ; PTX32-NEXT: { // callseq 2, 0 |
| ; PTX32-NEXT: .param .b32 param0; |
| ; PTX32-NEXT: st.param.b32 [param0], %r3; |
| ; PTX32-NEXT: call.uni bar, (param0); |
| ; PTX32-NEXT: } // callseq 2 |
| ; PTX32-NEXT: ret; |
| ; |
| ; PTX64-LABEL: foo4( |
| ; PTX64: { |
| ; PTX64-NEXT: .local .align 4 .b8 __local_depot3[8]; |
| ; PTX64-NEXT: .reg .b64 %SP; |
| ; PTX64-NEXT: .reg .b64 %SPL; |
| ; PTX64-NEXT: .reg .b64 %rd<5>; |
| ; PTX64-EMPTY: |
| ; PTX64-NEXT: // %bb.0: |
| ; PTX64-NEXT: mov.b64 %SPL, __local_depot3; |
| ; PTX64-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX64-NEXT: add.u64 %rd1, %SP, 0; |
| ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; |
| ; PTX64-NEXT: add.u64 %rd3, %SP, 4; |
| ; PTX64-NEXT: add.u64 %rd4, %SPL, 4; |
| ; PTX64-NEXT: st.local.b32 [%rd2], 0; |
| ; PTX64-NEXT: st.local.b32 [%rd4], 0; |
| ; PTX64-NEXT: { // callseq 1, 0 |
| ; PTX64-NEXT: .param .b64 param0; |
| ; PTX64-NEXT: st.param.b64 [param0], %rd1; |
| ; PTX64-NEXT: call.uni bar, (param0); |
| ; PTX64-NEXT: } // callseq 1 |
| ; PTX64-NEXT: { // callseq 2, 0 |
| ; PTX64-NEXT: .param .b64 param0; |
| ; PTX64-NEXT: st.param.b64 [param0], %rd3; |
| ; PTX64-NEXT: call.uni bar, (param0); |
| ; PTX64-NEXT: } // callseq 2 |
| ; PTX64-NEXT: ret; |
| ; Two separate i32 slots; together they account for the 8-byte depot. |
| %A = alloca i32 |
| %B = alloca i32 |
| ; Zero both slots before their addresses are handed to @bar. |
| store i32 0, ptr %A |
| store i32 0, ptr %B |
| ; One call per slot; each receives that slot's address. |
| call void @bar(ptr %A) |
| call void @bar(ptr %B) |
| ret void |
| } |