| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC |
| ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO |
| ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC |
| ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes PTX,PTXO |
| ; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} |
| |
| ; Module-level target description. The llc/opt invocations above that pass |
| ; --mtriple override the triple below; the ptxas-verify invocation does not. |
| ; Data layout string: little-endian ("e"); i64 is 64-bit aligned, i128 is |
| ; 128-bit aligned, v16/v32 vectors are 16-/32-bit aligned; native integer |
| ; widths are 16, 32 and 64 bits. |
| target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" |
| target triple = "nvptx64-nvidia-cuda" |
| |
| ; Aggregate types passed byval by the tests below. |
| ; %class.outer is a packed struct (the <{ ... }> form): a %class.inner, an i32, |
| ; and four explicit i8 bytes after it. |
| %class.outer = type <{ %class.inner, i32, [4 x i8] }> |
| ; %class.inner holds two pointers. |
| %class.inner = type { ptr, ptr } |
| ; %class.padded is a non-packed { i8, i32 }; it is the byval type of |
| ; @load_padding. |
| %class.padded = type { i8, i32 } |
| |
| ; Check that nvptx-lower-args preserves arg alignment. |
| ; %arg is a byval(%class.outer) pointer with align 8. The body loads the two |
| ; pointers of the inner struct and the i32 field (each load with align 8), |
| ; loads an i32 through the first pointer, adds it to the i32 field, and stores |
| ; the sum through the second pointer. The expected IR keeps align 8 on the |
| ; three loads from %arg. In the expected PTX the loads from the argument are |
| ; ld.local and the address handed to @escape is converted with cvta.local. |
| define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) { |
| ; IR-LABEL: define void @load_alignment( |
| ; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) { |
| ; IR-NEXT: [[ENTRY:.*:]] |
| ; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG]], align 8 |
| ; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 0, i32 1 |
| ; IR-NEXT: [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8 |
| ; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 1 |
| ; IR-NEXT: [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8 |
| ; IR-NEXT: [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4 |
| ; IR-NEXT: [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]] |
| ; IR-NEXT: store i32 [[ADD_I]], ptr [[ARG_IDX1_VAL]], align 4 |
| ; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull [[ARG_IDX2]]) |
| ; IR-NEXT: ret void |
| ; |
| ; PTX-LABEL: load_alignment( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<4>; |
| ; PTX-NEXT: .reg .b64 %rd<8>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0; |
| ; PTX-NEXT: ld.local.u64 %rd2, [%rd1]; |
| ; PTX-NEXT: ld.local.u64 %rd3, [%rd1+8]; |
| ; PTX-NEXT: add.s64 %rd4, %rd1, 16; |
| ; PTX-NEXT: cvta.local.u64 %rd5, %rd4; |
| ; PTX-NEXT: ld.local.u32 %r1, [%rd1+16]; |
| ; PTX-NEXT: ld.u32 %r2, [%rd2]; |
| ; PTX-NEXT: add.s32 %r3, %r2, %r1; |
| ; PTX-NEXT: st.u32 [%rd3], %r3; |
| ; PTX-NEXT: { // callseq 0, 0 |
| ; PTX-NEXT: .param .b64 param0; |
| ; PTX-NEXT: st.param.b64 [param0], %rd5; |
| ; PTX-NEXT: .param .b64 retval0; |
| ; PTX-NEXT: call.uni (retval0), |
| ; PTX-NEXT: escape, |
| ; PTX-NEXT: ( |
| ; PTX-NEXT: param0 |
| ; PTX-NEXT: ); |
| ; PTX-NEXT: ld.param.b64 %rd6, [retval0]; |
| ; PTX-NEXT: } // callseq 0 |
| ; PTX-NEXT: ret; |
| entry: |
| %arg.idx.val = load ptr, ptr %arg, align 8 |
| %arg.idx1 = getelementptr %class.outer, ptr %arg, i64 0, i32 0, i32 1 |
| %arg.idx1.val = load ptr, ptr %arg.idx1, align 8 |
| %arg.idx2 = getelementptr %class.outer, ptr %arg, i64 0, i32 1 |
| %arg.idx2.val = load i32, ptr %arg.idx2, align 8 |
| %arg.idx.val.val = load i32, ptr %arg.idx.val, align 4 |
| %add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val |
| store i32 %add.i, ptr %arg.idx1.val, align 4 |
| |
| ; Let a pointer derived from %arg escape through @escape. The original |
| ; rationale is that the escape keeps a local copy of the argument around, which |
| ; this test uses to check the load alignment. |
| ; NOTE(review): the expected IR above shows no explicit copy; confirm that this |
| ; rationale is still current. |
| %tmp = call ptr @escape(ptr nonnull %arg.idx2) |
| ret void |
| } |
| |
| ; Check that nvptx-lower-args copies padding as the struct may have been a union. |
| ; %arg is a byval(%class.padded) pointer declared without an explicit |
| ; alignment; the body only passes it to @escape. The expected signature shows |
| ; align 4 added to the byval parameter. |
| ; NOTE(review): the expected IR contains only the call and the ret, with no |
| ; explicit copy; confirm that the "copies padding" wording still matches what |
| ; the pass does for this function. |
| define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { |
| ; IR-LABEL: define void @load_padding( |
| ; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) align 4 captures(none) [[ARG:%.*]]) { |
| ; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG]]) |
| ; IR-NEXT: ret void |
| ; |
| ; PTX-LABEL: load_padding( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<5>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: mov.b64 %rd1, load_padding_param_0; |
| ; PTX-NEXT: cvta.local.u64 %rd2, %rd1; |
| ; PTX-NEXT: { // callseq 1, 0 |
| ; PTX-NEXT: .param .b64 param0; |
| ; PTX-NEXT: st.param.b64 [param0], %rd2; |
| ; PTX-NEXT: .param .b64 retval0; |
| ; PTX-NEXT: call.uni (retval0), |
| ; PTX-NEXT: escape, |
| ; PTX-NEXT: ( |
| ; PTX-NEXT: param0 |
| ; PTX-NEXT: ); |
| ; PTX-NEXT: ld.param.b64 %rd3, [retval0]; |
| ; PTX-NEXT: } // callseq 1 |
| ; PTX-NEXT: ret; |
| %tmp = call ptr @escape(ptr nonnull align 16 %arg) |
| ret void |
| } |
| |
| ; OpenCL can't make assumptions about incoming pointers, so generic-pointer |
| ; loads/stores should be generated for it. |
| ; Kernel with two generic (default address space) pointer parameters: it loads |
| ; an i32 from %in and stores it to %out. The expected IR is the same as the |
| ; input for both triples. In the expected PTX for nvptx64-nvidia-cuda (the PTXC |
| ; checks) both parameters go through cvta.to.global and the accesses are |
| ; ld.global/st.global; for nvptx64-nvidia-nvcl (the PTXO checks) the accesses |
| ; stay generic ld/st. |
| define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { |
| ; IR-LABEL: define ptx_kernel void @ptr_generic( |
| ; IR-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) { |
| ; IR-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4 |
| ; IR-NEXT: store i32 [[V]], ptr [[OUT]], align 4 |
| ; IR-NEXT: ret void |
| ; |
| ; PTXC-LABEL: ptr_generic( |
| ; PTXC: { |
| ; PTXC-NEXT: .reg .b32 %r<2>; |
| ; PTXC-NEXT: .reg .b64 %rd<5>; |
| ; PTXC-EMPTY: |
| ; PTXC-NEXT: // %bb.0: |
| ; PTXC-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0]; |
| ; PTXC-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1]; |
| ; PTXC-NEXT: cvta.to.global.u64 %rd3, %rd2; |
| ; PTXC-NEXT: cvta.to.global.u64 %rd4, %rd1; |
| ; PTXC-NEXT: ld.global.u32 %r1, [%rd3]; |
| ; PTXC-NEXT: st.global.u32 [%rd4], %r1; |
| ; PTXC-NEXT: ret; |
| ; |
| ; PTXO-LABEL: ptr_generic( |
| ; PTXO: { |
| ; PTXO-NEXT: .reg .b32 %r<2>; |
| ; PTXO-NEXT: .reg .b64 %rd<3>; |
| ; PTXO-EMPTY: |
| ; PTXO-NEXT: // %bb.0: |
| ; PTXO-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0]; |
| ; PTXO-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1]; |
| ; PTXO-NEXT: ld.u32 %r1, [%rd2]; |
| ; PTXO-NEXT: st.u32 [%rd1], %r1; |
| ; PTXO-NEXT: ret; |
| %v = load i32, ptr %in, align 4 |
| store i32 %v, ptr %out, align 4 |
| ret void |
| } |
| |
| ; Kernel whose pointer parameters already name an address space: %out is |
| ; addrspace(1) and %in is addrspace(3). It loads an i32 from %in and stores it |
| ; to %out. The expected IR is the same as the input, and the expected PTX, |
| ; shared by both triples, uses ld.shared and st.global with no cvta conversion. |
| define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { |
| ; IR-LABEL: define ptx_kernel void @ptr_nongeneric( |
| ; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) { |
| ; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4 |
| ; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 |
| ; IR-NEXT: ret void |
| ; |
| ; PTX-LABEL: ptr_nongeneric( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<2>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.u64 %rd1, [ptr_nongeneric_param_0]; |
| ; PTX-NEXT: ld.param.u64 %rd2, [ptr_nongeneric_param_1]; |
| ; PTX-NEXT: ld.shared.u32 %r1, [%rd2]; |
| ; PTX-NEXT: st.global.u32 [%rd1], %r1; |
| ; PTX-NEXT: ret; |
| %v = load i32, ptr addrspace(3) %in, align 4 |
| store i32 %v, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; Kernel that receives an address as an i64, converts it with inttoptr and |
| ; stores %v through the result. For the cuda triple (the IRC checks) the |
| ; expected IR adds an addrspacecast to addrspace(1) and back to the generic |
| ; address space after the inttoptr, and the expected PTX uses cvta.to.global |
| ; followed by st.global. For the nvcl triple (the IRO checks) the expected IR |
| ; is the same as the input and the expected PTX uses a generic st. |
| define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) { |
| ; IRC-LABEL: define ptx_kernel void @ptr_as_int( |
| ; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) { |
| ; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr |
| ; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) |
| ; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr |
| ; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4 |
| ; IRC-NEXT: ret void |
| ; |
| ; IRO-LABEL: define ptx_kernel void @ptr_as_int( |
| ; IRO-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) { |
| ; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr |
| ; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4 |
| ; IRO-NEXT: ret void |
| ; |
| ; PTXC-LABEL: ptr_as_int( |
| ; PTXC: { |
| ; PTXC-NEXT: .reg .b32 %r<2>; |
| ; PTXC-NEXT: .reg .b64 %rd<3>; |
| ; PTXC-EMPTY: |
| ; PTXC-NEXT: // %bb.0: |
| ; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0]; |
| ; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1]; |
| ; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTXC-NEXT: st.global.u32 [%rd2], %r1; |
| ; PTXC-NEXT: ret; |
| ; |
| ; PTXO-LABEL: ptr_as_int( |
| ; PTXO: { |
| ; PTXO-NEXT: .reg .b32 %r<2>; |
| ; PTXO-NEXT: .reg .b64 %rd<2>; |
| ; PTXO-EMPTY: |
| ; PTXO-NEXT: // %bb.0: |
| ; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0]; |
| ; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1]; |
| ; PTXO-NEXT: st.u32 [%rd1], %r1; |
| ; PTXO-NEXT: ret; |
| %p = inttoptr i64 %i to ptr |
| store i32 %v, ptr %p, align 4 |
| ret void |
| } |
| |
| ; Single-field struct wrapping an i64; byval type of @ptr_as_int_aggr. |
| %struct.S = type { i64 } |
| |
| ; Variant of @ptr_as_int in which the i64 address is loaded from a |
| ; byval(%struct.S) kernel parameter before the inttoptr. In the expected IR for |
| ; both triples the load goes through an addrspace(101) pointer produced by |
| ; @llvm.nvvm.internal.addrspace.wrap.p101.p0. Only the cuda triple (the IRC |
| ; checks) additionally gets the addrspace(1) round-trip cast before the store; |
| ; its expected PTX uses cvta.to.global and st.global, while the nvcl triple's |
| ; expected PTX uses a generic st. |
| define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) { |
| ; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr( |
| ; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) { |
| ; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) |
| ; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8 |
| ; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr |
| ; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) |
| ; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr |
| ; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4 |
| ; IRC-NEXT: ret void |
| ; |
| ; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr( |
| ; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) { |
| ; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) |
| ; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8 |
| ; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr |
| ; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4 |
| ; IRO-NEXT: ret void |
| ; |
| ; PTXC-LABEL: ptr_as_int_aggr( |
| ; PTXC: { |
| ; PTXC-NEXT: .reg .b32 %r<2>; |
| ; PTXC-NEXT: .reg .b64 %rd<3>; |
| ; PTXC-EMPTY: |
| ; PTXC-NEXT: // %bb.0: |
| ; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1]; |
| ; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0]; |
| ; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTXC-NEXT: st.global.u32 [%rd2], %r1; |
| ; PTXC-NEXT: ret; |
| ; |
| ; PTXO-LABEL: ptr_as_int_aggr( |
| ; PTXO: { |
| ; PTXO-NEXT: .reg .b32 %r<2>; |
| ; PTXO-NEXT: .reg .b64 %rd<2>; |
| ; PTXO-EMPTY: |
| ; PTXO-NEXT: // %bb.0: |
| ; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1]; |
| ; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0]; |
| ; PTXO-NEXT: st.u32 [%rd1], %r1; |
| ; PTXO-NEXT: ret; |
| %i = load i64, ptr %s, align 8 |
| %p = inttoptr i64 %i to ptr |
| store i32 %v, ptr %p, align 4 |
| ret void |
| } |
| |
| ; External function, declared here but not defined in this file. |
| ; @load_alignment and @load_padding pass it a pointer derived from their byval |
| ; argument. The declaration carries no function attribute group. |
| declare dso_local ptr @escape(ptr) local_unnamed_addr |