|  | ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | 
|  | ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC | 
|  | ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO | 
|  | ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC | 
|  | ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes PTX,PTXO | 
|  | ; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} | 
|  |  | 
|  | target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" | 
|  | target triple = "nvptx64-nvidia-cuda" | 
|  |  | 
|  | %class.outer = type <{ %class.inner, i32, [4 x i8] }> | 
|  | %class.inner = type { ptr, ptr } | 
|  | %class.padded = type { i8, i32 } | 
|  |  | 
|  | ; Check that nvptx-lower-args preserves arg alignment | 
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
; IR-LABEL: define void @load_alignment(
; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) {
; IR-NEXT:  [[ENTRY:.*:]]
; IR-NEXT:    [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG]], align 8
; IR-NEXT:    [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 0, i32 1
; IR-NEXT:    [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8
; IR-NEXT:    [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 1
; IR-NEXT:    [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8
; IR-NEXT:    [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4
; IR-NEXT:    [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]]
; IR-NEXT:    store i32 [[ADD_I]], ptr [[ARG_IDX1_VAL]], align 4
; IR-NEXT:    [[TMP:%.*]] = call ptr @escape(ptr nonnull [[ARG_IDX2]])
; IR-NEXT:    ret void
;
; PTX-LABEL: load_alignment(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<4>;
; PTX-NEXT:    .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.b64 %rd1, load_alignment_param_0;
; PTX-NEXT:    ld.local.b64 %rd2, [%rd1];
; PTX-NEXT:    ld.local.b64 %rd3, [%rd1+8];
; PTX-NEXT:    add.s64 %rd4, %rd1, 16;
; PTX-NEXT:    cvta.local.u64 %rd5, %rd4;
; PTX-NEXT:    ld.local.b32 %r1, [%rd1+16];
; PTX-NEXT:    ld.b32 %r2, [%rd2];
; PTX-NEXT:    add.s32 %r3, %r2, %r1;
; PTX-NEXT:    st.b32 [%rd3], %r3;
; PTX-NEXT:    { // callseq 0, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd5;
; PTX-NEXT:    .param .b64 retval0;
; PTX-NEXT:    call.uni (retval0),
; PTX-NEXT:    escape,
; PTX-NEXT:    (
; PTX-NEXT:    param0
; PTX-NEXT:    );
; PTX-NEXT:    ld.param.b64 %rd6, [retval0];
; PTX-NEXT:    } // callseq 0
; PTX-NEXT:    ret;
entry:
; Layout: %class.outer = <{ %class.inner, i32, [4 x i8] }> with
; %class.inner = { ptr, ptr }, so the i32 member lives at byte offset 16
; (matching the [%rd1+16] access in the PTX above). The `align 8` loads below
; must survive lowering for the PTX accesses to be legal.
%arg.idx.val = load ptr, ptr %arg, align 8
%arg.idx1 = getelementptr %class.outer, ptr %arg, i64 0, i32 0, i32 1
%arg.idx1.val = load ptr, ptr %arg.idx1, align 8
%arg.idx2 = getelementptr %class.outer, ptr %arg, i64 0, i32 1
%arg.idx2.val = load i32, ptr %arg.idx2, align 8
%arg.idx.val.val = load i32, ptr %arg.idx.val, align 4
%add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
store i32 %add.i, ptr %arg.idx1.val, align 4

; Let the pointer escape so the pass still creates a local copy; this test
; uses that copy to check the load alignment.
%tmp = call ptr @escape(ptr nonnull %arg.idx2)
ret void
}
|  |  | 
; Check that nvptx-lower-args copies padding too, as the struct may have been a union
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
; IR-LABEL: define void @load_padding(
; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) align 4 captures(none) [[ARG:%.*]]) {
; IR-NEXT:    [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG]])
; IR-NEXT:    ret void
;
; PTX-LABEL: load_padding(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd1, load_padding_param_0;
; PTX-NEXT:    cvta.local.u64 %rd2, %rd1;
; PTX-NEXT:    { // callseq 1, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd2;
; PTX-NEXT:    .param .b64 retval0;
; PTX-NEXT:    call.uni (retval0),
; PTX-NEXT:    escape,
; PTX-NEXT:    (
; PTX-NEXT:    param0
; PTX-NEXT:    );
; PTX-NEXT:    ld.param.b64 %rd3, [retval0];
; PTX-NEXT:    } // callseq 1
; PTX-NEXT:    ret;
; Escaping %arg forces a local copy of the whole byval aggregate (padding
; included, since the struct may have been a union); the copy is the
; cvta.local source in the PTX above.
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
ret void
}
|  |  | 
; OpenCL can't make assumptions about the incoming pointer's address space, so
; we should generate generic loads/stores.
define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
; IR-LABEL: define ptx_kernel void @ptr_generic(
; IR-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
; IR-NEXT:    [[V:%.*]] = load i32, ptr [[IN]], align 4
; IR-NEXT:    store i32 [[V]], ptr [[OUT]], align 4
; IR-NEXT:    ret void
;
; PTXC-LABEL: ptr_generic(
; PTXC:       {
; PTXC-NEXT:    .reg .b32 %r<2>;
; PTXC-NEXT:    .reg .b64 %rd<5>;
; PTXC-EMPTY:
; PTXC-NEXT:  // %bb.0:
; PTXC-NEXT:    ld.param.b64 %rd1, [ptr_generic_param_0];
; PTXC-NEXT:    ld.param.b64 %rd2, [ptr_generic_param_1];
; PTXC-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; PTXC-NEXT:    cvta.to.global.u64 %rd4, %rd1;
; PTXC-NEXT:    ld.global.b32 %r1, [%rd3];
; PTXC-NEXT:    st.global.b32 [%rd4], %r1;
; PTXC-NEXT:    ret;
;
; PTXO-LABEL: ptr_generic(
; PTXO:       {
; PTXO-NEXT:    .reg .b32 %r<2>;
; PTXO-NEXT:    .reg .b64 %rd<3>;
; PTXO-EMPTY:
; PTXO-NEXT:  // %bb.0:
; PTXO-NEXT:    ld.param.b64 %rd1, [ptr_generic_param_0];
; PTXO-NEXT:    ld.param.b64 %rd2, [ptr_generic_param_1];
; PTXO-NEXT:    ld.b32 %r1, [%rd2];
; PTXO-NEXT:    st.b32 [%rd1], %r1;
; PTXO-NEXT:    ret;
; Generic kernel pointer args: the CUDA target (PTXC) specializes them to the
; global space (cvta.to.global + ld/st.global), while NVCL (PTXO) must keep
; generic accesses.
%v = load i32, ptr  %in, align 4
store i32 %v, ptr %out, align 4
ret void
}
|  |  | 
define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; IR-LABEL: define ptx_kernel void @ptr_nongeneric(
; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
; IR-NEXT:    [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4
; IR-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; IR-NEXT:    ret void
;
; PTX-LABEL: ptr_nongeneric(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<2>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ptr_nongeneric_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ptr_nongeneric_param_1];
; PTX-NEXT:    ld.shared.b32 %r1, [%rd2];
; PTX-NEXT:    st.global.b32 [%rd1], %r1;
; PTX-NEXT:    ret;
; Explicit address spaces need no lowering and are preserved on both targets:
; shared load, global store (ld.shared / st.global in the PTX above).
%v = load i32, ptr addrspace(3) %in, align 4
store i32 %v, ptr addrspace(1) %out, align 4
ret void
}
|  |  | 
define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int(
; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
; IRC-NEXT:    [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT:    [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC-NEXT:    [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRC-NEXT:    store i32 [[V]], ptr [[P2]], align 4
; IRC-NEXT:    ret void
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int(
; IRO-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
; IRO-NEXT:    [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT:    store i32 [[V]], ptr [[P]], align 4
; IRO-NEXT:    ret void
;
; PTXC-LABEL: ptr_as_int(
; PTXC:       {
; PTXC-NEXT:    .reg .b32 %r<2>;
; PTXC-NEXT:    .reg .b64 %rd<3>;
; PTXC-EMPTY:
; PTXC-NEXT:  // %bb.0:
; PTXC-NEXT:    ld.param.b64 %rd1, [ptr_as_int_param_0];
; PTXC-NEXT:    ld.param.b32 %r1, [ptr_as_int_param_1];
; PTXC-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTXC-NEXT:    st.global.b32 [%rd2], %r1;
; PTXC-NEXT:    ret;
;
; PTXO-LABEL: ptr_as_int(
; PTXO:       {
; PTXO-NEXT:    .reg .b32 %r<2>;
; PTXO-NEXT:    .reg .b64 %rd<2>;
; PTXO-EMPTY:
; PTXO-NEXT:  // %bb.0:
; PTXO-NEXT:    ld.param.b64 %rd1, [ptr_as_int_param_0];
; PTXO-NEXT:    ld.param.b32 %r1, [ptr_as_int_param_1];
; PTXO-NEXT:    st.b32 [%rd1], %r1;
; PTXO-NEXT:    ret;
; A pointer passed as a plain i64: on CUDA (IRC) the pass inserts an
; addrspacecast round-trip through global, letting the store become st.global;
; on NVCL (IRO) the pointer stays generic.
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
}
|  |  | 
|  | %struct.S = type { i64 } | 
|  |  | 
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRC-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRC-NEXT:    [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
; IRC-NEXT:    [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT:    [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC-NEXT:    [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRC-NEXT:    store i32 [[V]], ptr [[P2]], align 4
; IRC-NEXT:    ret void
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRO-NEXT:    [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRO-NEXT:    [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
; IRO-NEXT:    [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT:    store i32 [[V]], ptr [[P]], align 4
; IRO-NEXT:    ret void
;
; PTXC-LABEL: ptr_as_int_aggr(
; PTXC:       {
; PTXC-NEXT:    .reg .b32 %r<2>;
; PTXC-NEXT:    .reg .b64 %rd<3>;
; PTXC-EMPTY:
; PTXC-NEXT:  // %bb.0:
; PTXC-NEXT:    ld.param.b32 %r1, [ptr_as_int_aggr_param_1];
; PTXC-NEXT:    ld.param.b64 %rd1, [ptr_as_int_aggr_param_0];
; PTXC-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTXC-NEXT:    st.global.b32 [%rd2], %r1;
; PTXC-NEXT:    ret;
;
; PTXO-LABEL: ptr_as_int_aggr(
; PTXO:       {
; PTXO-NEXT:    .reg .b32 %r<2>;
; PTXO-NEXT:    .reg .b64 %rd<2>;
; PTXO-EMPTY:
; PTXO-NEXT:  // %bb.0:
; PTXO-NEXT:    ld.param.b32 %r1, [ptr_as_int_aggr_param_1];
; PTXO-NEXT:    ld.param.b64 %rd1, [ptr_as_int_aggr_param_0];
; PTXO-NEXT:    st.b32 [%rd1], %r1;
; PTXO-NEXT:    ret;
; Same i64-smuggled pointer, but arriving inside a byval aggregate. Since %s
; does not escape, the load is rewritten to read straight from the param
; address space via @llvm.nvvm.internal.addrspace.wrap (see IRC/IRO above) —
; the PTX loads directly from the kernel param, with no local copy.
%i = load i64, ptr %s, align 8
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
}
|  |  | 
|  | ; Function Attrs: convergent nounwind | 
|  | declare dso_local ptr @escape(ptr) local_unnamed_addr |