; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes PTX,PTXO
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
; Data layout and default target triple for the module. The opt/llc FileCheck
; invocations above pass --mtriple explicitly (cuda and nvcl); the ptxas
; verification invocation does not, so it uses the triple declared here.
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; Packed struct passed by value to @load_alignment: a %class.inner, then an
; i32, then four trailing bytes.
%class.outer = type <{ %class.inner, i32, [4 x i8] }>
; Pair of pointers; the first member of %class.outer.
%class.inner = type { ptr, ptr }
; An i8 followed by an i32; passed by value to @load_padding.
%class.padded = type { i8, i32 }
; Check that nvptx-lower-args preserves arg alignment.
;
; @load_alignment receives a %class.outer by value (byval, align 8). It loads
; the two pointer members of the nested %class.inner and the i32 member that
; follows them, loads an i32 through the first pointer, adds the i32 member to
; it, and stores the sum through the second pointer. Finally it passes the
; address of the i32 member to @escape.
;
; The expected function body after the pass matches the input body, with every
; load keeping its original alignment. The expected PTX reads the three members
; with ld.local at offsets 0, 8 and 16 from the parameter symbol.
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
; IR-LABEL: define void @load_alignment(
; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) {
; IR-NEXT: [[ENTRY:.*:]]
; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG]], align 8
; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 0, i32 1
; IR-NEXT: [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8
; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 1
; IR-NEXT: [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8
; IR-NEXT: [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4
; IR-NEXT: [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]]
; IR-NEXT: store i32 [[ADD_I]], ptr [[ARG_IDX1_VAL]], align 4
; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull [[ARG_IDX2]])
; IR-NEXT: ret void
;
; PTX-LABEL: load_alignment(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0;
; PTX-NEXT: ld.local.b64 %rd2, [%rd1];
; PTX-NEXT: ld.local.b64 %rd3, [%rd1+8];
; PTX-NEXT: add.s64 %rd4, %rd1, 16;
; PTX-NEXT: cvta.local.u64 %rd5, %rd4;
; PTX-NEXT: ld.local.b32 %r1, [%rd1+16];
; PTX-NEXT: ld.b32 %r2, [%rd2];
; PTX-NEXT: add.s32 %r3, %r2, %r1;
; PTX-NEXT: st.b32 [%rd3], %r3;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 retval0;
; PTX-NEXT: call.uni (retval0),
; PTX-NEXT: escape,
; PTX-NEXT: (
; PTX-NEXT: param0
; PTX-NEXT: );
; PTX-NEXT: ld.param.b64 %rd6, [retval0];
; PTX-NEXT: } // callseq 0
; PTX-NEXT: ret;
entry:
; First pointer member of the nested %class.inner (offset 0 of the argument).
%arg.idx.val = load ptr, ptr %arg, align 8
; Second pointer member of the nested %class.inner.
%arg.idx1 = getelementptr %class.outer, ptr %arg, i64 0, i32 0, i32 1
%arg.idx1.val = load ptr, ptr %arg.idx1, align 8
; The i32 member of %class.outer; note this load is annotated align 8, not 4.
%arg.idx2 = getelementptr %class.outer, ptr %arg, i64 0, i32 1
%arg.idx2.val = load i32, ptr %arg.idx2, align 8
; *second = *first + i32 member.
%arg.idx.val.val = load i32, ptr %arg.idx.val, align 4
%add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
store i32 %add.i, ptr %arg.idx1.val, align 4
; Let the pointer escape so that a local copy of the byval argument is still
; created; the assertions above use that copy to check the load alignment.
%tmp = call ptr @escape(ptr nonnull %arg.idx2)
ret void
}
; Check that nvptx-lower-args copies padding as the struct may have been a union.
;
; @load_padding takes a %class.padded by value with no explicit alignment and
; passes the argument pointer directly to @escape. The expected output shows
; the byval parameter annotated with align 4, and the expected PTX converts the
; parameter's local address to a generic one (cvta.local) before the call.
;
; NOTE(review): the call site asserts align 16 on %arg while the expected byval
; alignment is 4 - confirm that this mismatch is intentional.
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
; IR-LABEL: define void @load_padding(
; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) align 4 captures(none) [[ARG:%.*]]) {
; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG]])
; IR-NEXT: ret void
;
; PTX-LABEL: load_padding(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, load_padding_param_0;
; PTX-NEXT: cvta.local.u64 %rd2, %rd1;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: .param .b64 retval0;
; PTX-NEXT: call.uni (retval0),
; PTX-NEXT: escape,
; PTX-NEXT: (
; PTX-NEXT: param0
; PTX-NEXT: );
; PTX-NEXT: ld.param.b64 %rd3, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
; The whole argument escapes; no individual member is accessed here.
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
ret void
}
; OpenCL can't make assumptions about incoming pointers, so we should generate
; generic-pointer loads/stores.
;
; @ptr_generic is a kernel that copies one i32 from %in to %out through
; pointers with no explicit address space. The expected function body is the
; same for both triples. The expected PTX differs: for the cuda triple both
; pointer arguments are converted with cvta.to.global and accessed with
; ld.global / st.global, while for the nvcl triple plain generic ld / st are
; expected.
define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
; IR-LABEL: define ptx_kernel void @ptr_generic(
; IR-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
; IR-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4
; IR-NEXT: store i32 [[V]], ptr [[OUT]], align 4
; IR-NEXT: ret void
;
; PTXC-LABEL: ptr_generic(
; PTXC: {
; PTXC-NEXT: .reg .b32 %r<2>;
; PTXC-NEXT: .reg .b64 %rd<5>;
; PTXC-EMPTY:
; PTXC-NEXT: // %bb.0:
; PTXC-NEXT: ld.param.b64 %rd1, [ptr_generic_param_0];
; PTXC-NEXT: ld.param.b64 %rd2, [ptr_generic_param_1];
; PTXC-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTXC-NEXT: cvta.to.global.u64 %rd4, %rd1;
; PTXC-NEXT: ld.global.b32 %r1, [%rd3];
; PTXC-NEXT: st.global.b32 [%rd4], %r1;
; PTXC-NEXT: ret;
;
; PTXO-LABEL: ptr_generic(
; PTXO: {
; PTXO-NEXT: .reg .b32 %r<2>;
; PTXO-NEXT: .reg .b64 %rd<3>;
; PTXO-EMPTY:
; PTXO-NEXT: // %bb.0:
; PTXO-NEXT: ld.param.b64 %rd1, [ptr_generic_param_0];
; PTXO-NEXT: ld.param.b64 %rd2, [ptr_generic_param_1];
; PTXO-NEXT: ld.b32 %r1, [%rd2];
; PTXO-NEXT: st.b32 [%rd1], %r1;
; PTXO-NEXT: ret;
; *out = *in
%v = load i32, ptr %in, align 4
store i32 %v, ptr %out, align 4
ret void
}
; @ptr_nongeneric is a kernel whose pointer arguments already carry explicit
; address spaces: addrspace(1) for %out and addrspace(3) for %in. The expected
; function body is unchanged, and both triples share the same expected PTX:
; ld.shared for the read and st.global for the write, with no cvta conversion.
define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; IR-LABEL: define ptx_kernel void @ptr_nongeneric(
; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4
; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; IR-NEXT: ret void
;
; PTX-LABEL: ptr_nongeneric(
; PTX: {
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ptr_nongeneric_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ptr_nongeneric_param_1];
; PTX-NEXT: ld.shared.b32 %r1, [%rd2];
; PTX-NEXT: st.global.b32 [%rd1], %r1;
; PTX-NEXT: ret;
; *out = *in, using the address spaces given in the signature.
%v = load i32, ptr addrspace(3) %in, align 4
store i32 %v, ptr addrspace(1) %out, align 4
ret void
}
; @ptr_as_int is a kernel that receives an address as a plain i64, turns it
; into a pointer with inttoptr, and stores %v through it.
;
; For the cuda triple the pass is expected to round-trip the resulting pointer
; through addrspace(1) (addrspacecast to addrspace(1) and back), which yields
; cvta.to.global followed by st.global in the expected PTX. For the nvcl triple
; the function body is expected to stay unchanged and a generic st is emitted.
define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int(
; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4
; IRC-NEXT: ret void
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int(
; IRO-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
; IRO-NEXT: ret void
;
; PTXC-LABEL: ptr_as_int(
; PTXC: {
; PTXC-NEXT: .reg .b32 %r<2>;
; PTXC-NEXT: .reg .b64 %rd<3>;
; PTXC-EMPTY:
; PTXC-NEXT: // %bb.0:
; PTXC-NEXT: ld.param.b64 %rd1, [ptr_as_int_param_0];
; PTXC-NEXT: ld.param.b32 %r1, [ptr_as_int_param_1];
; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1;
; PTXC-NEXT: st.global.b32 [%rd2], %r1;
; PTXC-NEXT: ret;
;
; PTXO-LABEL: ptr_as_int(
; PTXO: {
; PTXO-NEXT: .reg .b32 %r<2>;
; PTXO-NEXT: .reg .b64 %rd<2>;
; PTXO-EMPTY:
; PTXO-NEXT: // %bb.0:
; PTXO-NEXT: ld.param.b64 %rd1, [ptr_as_int_param_0];
; PTXO-NEXT: ld.param.b32 %r1, [ptr_as_int_param_1];
; PTXO-NEXT: st.b32 [%rd1], %r1;
; PTXO-NEXT: ret;
; *(i32 *)i = v
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
}
; Wrapper around a single i64; @ptr_as_int_aggr treats that i64 as an address.
%struct.S = type { i64 }
; @ptr_as_int_aggr performs the same store as @ptr_as_int, but the i64 address
; arrives inside a byval %struct.S and is loaded from it first.
;
; For both triples the load from the byval argument is expected to go through
; an addrspace(101) pointer obtained from llvm.nvvm.internal.addrspace.wrap.
; For the cuda triple the inttoptr result is additionally expected to be
; round-tripped through addrspace(1), giving cvta.to.global + st.global in the
; expected PTX; for the nvcl triple a generic st is expected.
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4
; IRC-NEXT: ret void
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
; IRO-NEXT: ret void
;
; PTXC-LABEL: ptr_as_int_aggr(
; PTXC: {
; PTXC-NEXT: .reg .b32 %r<2>;
; PTXC-NEXT: .reg .b64 %rd<3>;
; PTXC-EMPTY:
; PTXC-NEXT: // %bb.0:
; PTXC-NEXT: ld.param.b32 %r1, [ptr_as_int_aggr_param_1];
; PTXC-NEXT: ld.param.b64 %rd1, [ptr_as_int_aggr_param_0];
; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1;
; PTXC-NEXT: st.global.b32 [%rd2], %r1;
; PTXC-NEXT: ret;
;
; PTXO-LABEL: ptr_as_int_aggr(
; PTXO: {
; PTXO-NEXT: .reg .b32 %r<2>;
; PTXO-NEXT: .reg .b64 %rd<2>;
; PTXO-EMPTY:
; PTXO-NEXT: // %bb.0:
; PTXO-NEXT: ld.param.b32 %r1, [ptr_as_int_aggr_param_1];
; PTXO-NEXT: ld.param.b64 %rd1, [ptr_as_int_aggr_param_0];
; PTXO-NEXT: st.b32 [%rd1], %r1;
; PTXO-NEXT: ret;
; Read the i64 out of the byval struct, then *(i32 *)i = v.
%i = load i64, ptr %s, align 8
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
}
; External function, declared only. @load_alignment and @load_padding pass it
; pointers derived from their byval arguments so that those pointers escape.
; The declaration itself carries no function attributes.
declare dso_local ptr @escape(ptr) local_unnamed_addr