| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60 |
| ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70 |
| ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60 |
| ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70 |
| ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-copy-byval-args -S | FileCheck %s --check-prefixes=COMMON,COPY |
| ; RUN: llc < %s -mcpu=sm_60 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_60 |
| ; RUN: llc < %s -mcpu=sm_70 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_70 |
| source_filename = "<stdin>" |
| target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" |
| target triple = "nvptx64-nvidia-cuda" |
| |
| ; Two-field aggregate used as the byval kernel parameter type by the tests below. |
| %struct.S = type { i32, i32 } |
| |
| ; External helper taking a pointer; the escape_ptr and escape_ptr_gep tests below pass it a pointer derived from the byval argument. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0 |
| |
| ; External helper taking an i32; not referenced in the part of the file reviewed here (presumably used further down). |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0 |
| |
| ; Memory intrinsics. memcpy is called by the memcpy_* tests below. |
| ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) |
| declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1 |
| |
| ; memmove is not called in the part of the file reviewed here (presumably used further down). |
| ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) |
| declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1 |
| |
| ; memset is not called in the part of the file reviewed here (presumably used further down). |
| ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) |
| declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 |
| |
| ; Test: the kernel only reads the byval struct. It loads the i32 at offset 0 of %s and stores it to %out. |
| ; The LOWER-ARGS checks expect the load to go through an addrspacecast of %s to addrspace(101), with no local copy. |
| ; The COPY checks expect an alloca of the struct, an 8-byte memcpy from addrspace(101), and the load taken from that copy. |
| ; The PTX checks expect a single ld.param.u32 from read_only_param_1 and no local depot. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only( |
| ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { |
| ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] |
| ; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 |
| ; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 |
| ; LOWER-ARGS-NEXT: ret void |
| ; |
| ; COPY-LABEL: define dso_local ptx_kernel void @read_only( |
| ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { |
| ; COPY-NEXT: [[ENTRY:.*:]] |
| ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S1]], align 4 |
| ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX-LABEL: read_only( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<2>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: ld.param.u64 %rd1, [read_only_param_0]; |
| ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX-NEXT: ld.param.u32 %r1, [read_only_param_1]; |
| ; PTX-NEXT: st.global.u32 [%rd2], %r1; |
| ; PTX-NEXT: ret; |
| entry: |
| %i = load i32, ptr %s, align 4 |
| store i32 %i, ptr %out, align 4 |
| ret void |
| } |
| |
| ; Test: read-only access through a GEP. It loads the i32 at byte offset 4 of the byval struct %s and stores it to %out. |
| ; The LOWER-ARGS checks expect the GEP and load to be rebuilt on the addrspace(101) cast of %s. |
| ; The rebuilt GEP is checked as "inbounds" only, while the input GEP is "inbounds nuw". |
| ; The COPY checks expect the GEP and load to operate on a local alloca filled by an 8-byte memcpy. |
| ; The PTX checks expect a direct ld.param.u32 from read_only_gep_param_1+4. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep( |
| ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] |
| ; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 |
| ; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 |
| ; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 |
| ; LOWER-ARGS-NEXT: ret void |
| ; |
| ; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep( |
| ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COPY-NEXT: [[ENTRY:.*:]] |
| ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 |
| ; COPY-NEXT: [[I:%.*]] = load i32, ptr [[B]], align 4 |
| ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX-LABEL: read_only_gep( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<2>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_param_0]; |
| ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_param_1+4]; |
| ; PTX-NEXT: st.global.u32 [%rd2], %r1; |
| ; PTX-NEXT: ret; |
| entry: |
| %b = getelementptr inbounds nuw i8, ptr %s, i64 4 |
| %i = load i32, ptr %b, align 4 |
| store i32 %i, ptr %out, align 4 |
| ret void |
| } |
| |
| ; Test: like read_only_gep, but the input IR already casts the GEP result to addrspace(101) before loading. |
| ; The LOWER-ARGS checks expect the same output shape as read_only_gep: a cast of %s, a GEP in addrspace(101), then the load. |
| ; The COPY checks expect the input's GEP and addrspacecast to be kept, applied to the local alloca copy. |
| ; The PTX checks expect a direct ld.param.u32 from read_only_gep_asc_param_1+4. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( |
| ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] |
| ; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 |
| ; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 |
| ; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 |
| ; LOWER-ARGS-NEXT: ret void |
| ; |
| ; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( |
| ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COPY-NEXT: [[ENTRY:.*:]] |
| ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 |
| ; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) |
| ; COPY-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[ASC]], align 4 |
| ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX-LABEL: read_only_gep_asc( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<2>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc_param_0]; |
| ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc_param_1+4]; |
| ; PTX-NEXT: st.global.u32 [%rd2], %r1; |
| ; PTX-NEXT: ret; |
| entry: |
| %b = getelementptr inbounds nuw i8, ptr %s, i64 4 |
| %asc = addrspacecast ptr %b to ptr addrspace(101) |
| %i = load i32, ptr addrspace(101) %asc, align 4 |
| store i32 %i, ptr %out, align 4 |
| ret void |
| } |
| |
| ; Test: the GEP result is cast to addrspace(101) and then cast back to a generic pointer before the load. |
| ; The COMMON checks apply to every opt RUN line; they expect a local alloca copy with the GEP and both casts kept on it. |
| ; The PTX checks still expect a direct ld.param.u32 from read_only_gep_asc0_param_1+4 and no local depot. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; COMMON-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( |
| ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { |
| ; COMMON-NEXT: [[ENTRY:.*:]] |
| ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 |
| ; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) |
| ; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr |
| ; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 |
| ; COMMON-NEXT: store i32 [[I]], ptr [[OUT]], align 4 |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: read_only_gep_asc0( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<2>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc0_param_0]; |
| ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4]; |
| ; PTX-NEXT: st.global.u32 [%rd2], %r1; |
| ; PTX-NEXT: ret; |
| entry: |
| %b = getelementptr inbounds nuw i8, ptr %s, i64 4 |
| %asc = addrspacecast ptr %b to ptr addrspace(101) |
| %asc0 = addrspacecast ptr addrspace(101) %asc to ptr |
| %i = load i32, ptr %asc0, align 4 |
| store i32 %i, ptr %out, align 4 |
| ret void |
| } |
| |
| ; Test: the byval pointer %s escapes by being passed directly to the external function @_Z6escapePv. |
| ; The SM_60, SM_70 and COPY checks all expect a local alloca copy (8-byte memcpy from addrspace(101)) to be passed to the call. |
| ; The SM_60 and SM_70 bodies differ only in the attribute-group capture on the call (ATTR5 versus ATTR6). |
| ; The PTX checks expect an 8-byte local depot, two ld.param.u32/st.local.u32 pairs filling it, and the generic depot address as the call argument. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr( |
| ; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; SM_60-NEXT: [[ENTRY:.*:]] |
| ; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] |
| ; SM_60-NEXT: ret void |
| ; |
| ; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr( |
| ; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; SM_70-NEXT: [[ENTRY:.*:]] |
| ; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]] |
| ; SM_70-NEXT: ret void |
| ; |
| ; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr( |
| ; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COPY-NEXT: [[ENTRY:.*:]] |
| ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX-LABEL: escape_ptr( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 4 .b8 __local_depot4[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .b32 %r<3>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot4; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: add.u64 %rd1, %SP, 0; |
| ; PTX-NEXT: add.u64 %rd2, %SPL, 0; |
| ; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_param_1+4]; |
| ; PTX-NEXT: st.local.u32 [%rd2+4], %r1; |
| ; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_param_1]; |
| ; PTX-NEXT: st.local.u32 [%rd2], %r2; |
| ; PTX-NEXT: { // callseq 0, 0 |
| ; PTX-NEXT: .param .b64 param0; |
| ; PTX-NEXT: st.param.b64 [param0], %rd1; |
| ; PTX-NEXT: call.uni |
| ; PTX-NEXT: _Z6escapePv, |
| ; PTX-NEXT: ( |
| ; PTX-NEXT: param0 |
| ; PTX-NEXT: ); |
| ; PTX-NEXT: } // callseq 0 |
| ; PTX-NEXT: ret; |
| entry: |
| call void @_Z6escapePv(ptr noundef nonnull %s) #0 |
| ret void |
| } |
| |
| ; Test: a pointer derived from the byval argument (GEP at byte offset 4) escapes through a call to @_Z6escapePv. |
| ; The SM_60, SM_70 and COPY checks all expect a local alloca copy, with the GEP taken on that copy and passed to the call. |
| ; The PTX checks expect the struct to be copied into an 8-byte local depot and the generic depot address plus 4 to be the call argument. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( |
| ; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; SM_60-NEXT: [[ENTRY:.*:]] |
| ; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 |
| ; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] |
| ; SM_60-NEXT: ret void |
| ; |
| ; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( |
| ; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; SM_70-NEXT: [[ENTRY:.*:]] |
| ; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 |
| ; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]] |
| ; SM_70-NEXT: ret void |
| ; |
| ; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( |
| ; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COPY-NEXT: [[ENTRY:.*:]] |
| ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 |
| ; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX-LABEL: escape_ptr_gep( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 4 .b8 __local_depot5[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .b32 %r<3>; |
| ; PTX-NEXT: .reg .b64 %rd<4>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot5; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: add.u64 %rd1, %SP, 0; |
| ; PTX-NEXT: add.u64 %rd2, %SPL, 0; |
| ; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_param_1+4]; |
| ; PTX-NEXT: st.local.u32 [%rd2+4], %r1; |
| ; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_param_1]; |
| ; PTX-NEXT: st.local.u32 [%rd2], %r2; |
| ; PTX-NEXT: add.s64 %rd3, %rd1, 4; |
| ; PTX-NEXT: { // callseq 1, 0 |
| ; PTX-NEXT: .param .b64 param0; |
| ; PTX-NEXT: st.param.b64 [param0], %rd3; |
| ; PTX-NEXT: call.uni |
| ; PTX-NEXT: _Z6escapePv, |
| ; PTX-NEXT: ( |
| ; PTX-NEXT: param0 |
| ; PTX-NEXT: ); |
| ; PTX-NEXT: } // callseq 1 |
| ; PTX-NEXT: ret; |
| entry: |
| %b = getelementptr inbounds nuw i8, ptr %s, i64 4 |
| call void @_Z6escapePv(ptr noundef nonnull %b) #0 |
| ret void |
| } |
| |
| ; Test: the byval pointer %s escapes by being stored as a pointer value into %out. |
| ; The COMMON checks apply to every opt RUN line; they expect a local alloca copy whose address is the value stored. |
| ; The PTX checks expect the struct to be copied into an 8-byte local depot and the generic depot address stored with st.global.u64. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_store( |
| ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COMMON-NEXT: [[ENTRY:.*:]] |
| ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8 |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: escape_ptr_store( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 4 .b8 __local_depot6[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .b32 %r<3>; |
| ; PTX-NEXT: .reg .b64 %rd<5>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot6; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0]; |
| ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX-NEXT: add.u64 %rd3, %SP, 0; |
| ; PTX-NEXT: add.u64 %rd4, %SPL, 0; |
| ; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_store_param_1+4]; |
| ; PTX-NEXT: st.local.u32 [%rd4+4], %r1; |
| ; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_store_param_1]; |
| ; PTX-NEXT: st.local.u32 [%rd4], %r2; |
| ; PTX-NEXT: st.global.u64 [%rd2], %rd3; |
| ; PTX-NEXT: ret; |
| entry: |
| store ptr %s, ptr %out, align 8 |
| ret void |
| } |
| |
| ; Test: a pointer derived from the byval argument (GEP at byte offset 4) escapes by being stored into %out. |
| ; The COMMON checks apply to every opt RUN line; they expect a local alloca copy, with the GEP taken on the copy and its result stored. |
| ; The PTX checks expect the struct to be copied into an 8-byte local depot and the generic depot address plus 4 stored with st.global.u64. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( |
| ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COMMON-NEXT: [[ENTRY:.*:]] |
| ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 |
| ; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8 |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: escape_ptr_gep_store( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 4 .b8 __local_depot7[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .b32 %r<3>; |
| ; PTX-NEXT: .reg .b64 %rd<6>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot7; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_gep_store_param_0]; |
| ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX-NEXT: add.u64 %rd3, %SP, 0; |
| ; PTX-NEXT: add.u64 %rd4, %SPL, 0; |
| ; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_store_param_1+4]; |
| ; PTX-NEXT: st.local.u32 [%rd4+4], %r1; |
| ; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_store_param_1]; |
| ; PTX-NEXT: st.local.u32 [%rd4], %r2; |
| ; PTX-NEXT: add.s64 %rd5, %rd3, 4; |
| ; PTX-NEXT: st.global.u64 [%rd2], %rd5; |
| ; PTX-NEXT: ret; |
| entry: |
| %b = getelementptr inbounds nuw i8, ptr %s, i64 4 |
| store ptr %b, ptr %out, align 8 |
| ret void |
| } |
| |
| ; Test: the byval pointer %s escapes by being converted to an i64 with ptrtoint and stored into %out. |
| ; The COMMON checks apply to every opt RUN line; they expect a local alloca copy, with ptrtoint applied to the copy's address. |
| ; The PTX checks expect the struct to be copied into an 8-byte local depot and the generic depot address stored with st.global.u64. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( |
| ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COMMON-NEXT: [[ENTRY:.*:]] |
| ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64 |
| ; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8 |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: escape_ptrtoint( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 4 .b8 __local_depot8[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .b32 %r<3>; |
| ; PTX-NEXT: .reg .b64 %rd<5>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot8; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0]; |
| ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX-NEXT: add.u64 %rd3, %SP, 0; |
| ; PTX-NEXT: add.u64 %rd4, %SPL, 0; |
| ; PTX-NEXT: ld.param.u32 %r1, [escape_ptrtoint_param_1+4]; |
| ; PTX-NEXT: st.local.u32 [%rd4+4], %r1; |
| ; PTX-NEXT: ld.param.u32 %r2, [escape_ptrtoint_param_1]; |
| ; PTX-NEXT: st.local.u32 [%rd4], %r2; |
| ; PTX-NEXT: st.global.u64 [%rd2], %rd3; |
| ; PTX-NEXT: ret; |
| entry: |
| %i = ptrtoint ptr %s to i64 |
| store i64 %i, ptr %out, align 8 |
| ret void |
| } |
| |
| ; Test: the byval struct is the source of a volatile 16-byte llvm.memcpy into %out. |
| ; The LOWER-ARGS checks expect the memcpy source to become the addrspace(101) cast of %s (p0.p101 overload), with no local copy. |
| ; The COPY checks expect an alloca filled by an 8-byte memcpy, then the original memcpy reading from that copy. |
| ; The PTX checks expect sixteen ld.param.u8 / st.volatile.u8 pairs, from offset 15 down to 0. |
| ; NOTE(review): the 16-byte length exceeds the 8 bytes the checks copy for %struct.S; presumably intentional, confirm. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param( |
| ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] |
| ; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) |
| ; LOWER-ARGS-NEXT: ret void |
| ; |
| ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param( |
| ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COPY-NEXT: [[ENTRY:.*:]] |
| ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX-LABEL: memcpy_from_param( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b16 %rs<17>; |
| ; PTX-NEXT: .reg .b64 %rd<2>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_param_0]; |
| ; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_param_1+15]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; |
| ; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_param_1+14]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; |
| ; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_param_1+13]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; |
| ; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_param_1+12]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; |
| ; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_param_1+11]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; |
| ; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_param_1+10]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; |
| ; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_param_1+9]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; |
| ; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_param_1+8]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; |
| ; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_param_1+7]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; |
| ; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_param_1+6]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; |
| ; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_param_1+5]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; |
| ; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_param_1+4]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; |
| ; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_param_1+3]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; |
| ; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_param_1+2]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; |
| ; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_param_1+1]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; |
| ; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_param_1]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; |
| ; PTX-NEXT: ret; |
| entry: |
| tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) |
| ret void |
| } |
| |
| ; Test: same as memcpy_from_param, but the byval parameter carries no explicit align attribute in the input. |
| ; The LOWER-ARGS checks expect "align 4" to appear on the byval parameter in the output signature, and the memcpy to read from addrspace(101). |
| ; The COPY checks expect the signature to stay without an align attribute, and the local alloca and its 8-byte memcpy to use align 8. |
| ; The PTX checks expect sixteen ld.param.u8 / st.volatile.u8 pairs, from offset 15 down to 0. |
| ; NOTE(review): the 16-byte length exceeds the 8 bytes the checks copy for %struct.S; presumably intentional, confirm. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 { |
| ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( |
| ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] |
| ; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) |
| ; LOWER-ARGS-NEXT: ret void |
| ; |
| ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( |
| ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COPY-NEXT: [[ENTRY:.*:]] |
| ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 8 |
| ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[S1]], ptr addrspace(101) align 8 [[S2]], i64 8, i1 false) |
| ; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX-LABEL: memcpy_from_param_noalign( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b16 %rs<17>; |
| ; PTX-NEXT: .reg .b64 %rd<2>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_noalign_param_0]; |
| ; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_noalign_param_1+15]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; |
| ; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_noalign_param_1+14]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; |
| ; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_noalign_param_1+13]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; |
| ; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_noalign_param_1+12]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; |
| ; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_noalign_param_1+11]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; |
| ; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_noalign_param_1+10]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; |
| ; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_noalign_param_1+9]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; |
| ; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_noalign_param_1+8]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; |
| ; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_noalign_param_1+7]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; |
| ; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_noalign_param_1+6]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; |
| ; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_noalign_param_1+5]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; |
| ; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_noalign_param_1+4]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; |
| ; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_noalign_param_1+3]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; |
| ; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_noalign_param_1+2]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; |
| ; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_noalign_param_1+1]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; |
| ; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_noalign_param_1]; |
| ; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; |
| ; PTX-NEXT: ret; |
| entry: |
| tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) |
| ret void |
| } |
| |
| ; memcpy_to_param: kernel that performs a volatile memcpy whose destination is |
| ; the byval %struct.S argument %s and whose source is the plain pointer %in. |
| ; Because the byval argument is written, the IR checks (shared by every opt |
| ; RUN line) expect %s to be copied from addrspace(101) into a local alloca and |
| ; the user memcpy to be redirected to that alloca. The expected PTX assembles |
| ; the volatile byte loads from %in into two 64-bit volatile stores to the |
| ; local depot. |
| ; NOTE(review): the memcpy length is 16 bytes while %struct.S is { i32, i32 } |
| ; (the param copy above it is 8 bytes) - confirm the oversize copy is intended. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { |
| ; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param( |
| ; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COMMON-NEXT: [[ENTRY:.*:]] |
| ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true) |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: memcpy_to_param( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 8 .b8 __local_depot11[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .b32 %r<3>; |
| ; PTX-NEXT: .reg .b64 %rd<48>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %entry |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot11; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: ld.param.u64 %rd1, [memcpy_to_param_param_0]; |
| ; PTX-NEXT: add.u64 %rd3, %SPL, 0; |
| ; PTX-NEXT: ld.param.u32 %r1, [memcpy_to_param_param_1+4]; |
| ; PTX-NEXT: st.local.u32 [%rd3+4], %r1; |
| ; PTX-NEXT: ld.param.u32 %r2, [memcpy_to_param_param_1]; |
| ; PTX-NEXT: st.local.u32 [%rd3], %r2; |
| ; PTX-NEXT: ld.volatile.u8 %rd4, [%rd1]; |
| ; PTX-NEXT: ld.volatile.u8 %rd5, [%rd1+1]; |
| ; PTX-NEXT: shl.b64 %rd6, %rd5, 8; |
| ; PTX-NEXT: or.b64 %rd7, %rd6, %rd4; |
| ; PTX-NEXT: ld.volatile.u8 %rd8, [%rd1+2]; |
| ; PTX-NEXT: shl.b64 %rd9, %rd8, 16; |
| ; PTX-NEXT: ld.volatile.u8 %rd10, [%rd1+3]; |
| ; PTX-NEXT: shl.b64 %rd11, %rd10, 24; |
| ; PTX-NEXT: or.b64 %rd12, %rd11, %rd9; |
| ; PTX-NEXT: or.b64 %rd13, %rd12, %rd7; |
| ; PTX-NEXT: ld.volatile.u8 %rd14, [%rd1+4]; |
| ; PTX-NEXT: ld.volatile.u8 %rd15, [%rd1+5]; |
| ; PTX-NEXT: shl.b64 %rd16, %rd15, 8; |
| ; PTX-NEXT: or.b64 %rd17, %rd16, %rd14; |
| ; PTX-NEXT: ld.volatile.u8 %rd18, [%rd1+6]; |
| ; PTX-NEXT: shl.b64 %rd19, %rd18, 16; |
| ; PTX-NEXT: ld.volatile.u8 %rd20, [%rd1+7]; |
| ; PTX-NEXT: shl.b64 %rd21, %rd20, 24; |
| ; PTX-NEXT: or.b64 %rd22, %rd21, %rd19; |
| ; PTX-NEXT: or.b64 %rd23, %rd22, %rd17; |
| ; PTX-NEXT: shl.b64 %rd24, %rd23, 32; |
| ; PTX-NEXT: or.b64 %rd25, %rd24, %rd13; |
| ; PTX-NEXT: st.volatile.u64 [%SP], %rd25; |
| ; PTX-NEXT: ld.volatile.u8 %rd26, [%rd1+8]; |
| ; PTX-NEXT: ld.volatile.u8 %rd27, [%rd1+9]; |
| ; PTX-NEXT: shl.b64 %rd28, %rd27, 8; |
| ; PTX-NEXT: or.b64 %rd29, %rd28, %rd26; |
| ; PTX-NEXT: ld.volatile.u8 %rd30, [%rd1+10]; |
| ; PTX-NEXT: shl.b64 %rd31, %rd30, 16; |
| ; PTX-NEXT: ld.volatile.u8 %rd32, [%rd1+11]; |
| ; PTX-NEXT: shl.b64 %rd33, %rd32, 24; |
| ; PTX-NEXT: or.b64 %rd34, %rd33, %rd31; |
| ; PTX-NEXT: or.b64 %rd35, %rd34, %rd29; |
| ; PTX-NEXT: ld.volatile.u8 %rd36, [%rd1+12]; |
| ; PTX-NEXT: ld.volatile.u8 %rd37, [%rd1+13]; |
| ; PTX-NEXT: shl.b64 %rd38, %rd37, 8; |
| ; PTX-NEXT: or.b64 %rd39, %rd38, %rd36; |
| ; PTX-NEXT: ld.volatile.u8 %rd40, [%rd1+14]; |
| ; PTX-NEXT: shl.b64 %rd41, %rd40, 16; |
| ; PTX-NEXT: ld.volatile.u8 %rd42, [%rd1+15]; |
| ; PTX-NEXT: shl.b64 %rd43, %rd42, 24; |
| ; PTX-NEXT: or.b64 %rd44, %rd43, %rd41; |
| ; PTX-NEXT: or.b64 %rd45, %rd44, %rd39; |
| ; PTX-NEXT: shl.b64 %rd46, %rd45, 32; |
| ; PTX-NEXT: or.b64 %rd47, %rd46, %rd35; |
| ; PTX-NEXT: st.volatile.u64 [%SP+8], %rd47; |
| ; PTX-NEXT: ret; |
| entry: |
| tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) |
| ret void |
| } |
| |
| ; copy_on_store: kernel that loads an i32 from %in and stores it into the byval |
| ; %struct.S argument %s. The IR checks (shared by every opt RUN line) expect |
| ; the argument to be copied into a local alloca and the store redirected to |
| ; that copy. The expected PTX body contains only a return; presumably the store |
| ; is dropped because the local copy is never read afterwards. The %b argument |
| ; is not used by the body. |
| ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 { |
| ; COMMON-LABEL: define dso_local ptx_kernel void @copy_on_store( |
| ; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; COMMON-NEXT: [[BB:.*:]] |
| ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) |
| ; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4 |
| ; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4 |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: copy_on_store( |
| ; PTX: { |
| ; PTX-EMPTY: |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %bb |
| ; PTX-NEXT: ret; |
| bb: |
| %i = load i32, ptr %in, align 4 |
| store i32 %i, ptr %s, align 4 |
| ret void |
| } |
| |
| ; test_select: kernel that picks one of two byval i32 arguments with a select |
| ; on %cond, loads through the selected pointer and stores the value to %out. |
| ; The byval arguments are only read here. Expected results differ by run: |
| ;  - SM_60 lower-args and the copy-byval-args run expect each argument to be |
| ;    copied from addrspace(101) into its own alloca before the select. |
| ;  - SM_70 lower-args expects no copy; each argument is cast to addrspace(101) |
| ;    and converted back with llvm.nvvm.ptr.param.to.gen. |
| ;  - The sm_60 PTX selects between two loaded parameter values (selp.b32); |
| ;    the sm_70 PTX selects between two cvta.param addresses (selp.b64) and |
| ;    then loads. |
| define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { |
| ; SM_60-LABEL: define ptx_kernel void @test_select( |
| ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { |
| ; SM_60-NEXT: [[BB:.*:]] |
| ; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 |
| ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false) |
| ; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 |
| ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) |
| ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] |
| ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 |
| ; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 |
| ; SM_60-NEXT: ret void |
| ; |
| ; SM_70-LABEL: define ptx_kernel void @test_select( |
| ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { |
| ; SM_70-NEXT: [[BB:.*:]] |
| ; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) |
| ; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) |
| ; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] |
| ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 |
| ; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 |
| ; SM_70-NEXT: ret void |
| ; |
| ; COPY-LABEL: define ptx_kernel void @test_select( |
| ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { |
| ; COPY-NEXT: [[BB:.*:]] |
| ; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 |
| ; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) |
| ; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 |
| ; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) |
| ; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] |
| ; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 |
| ; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX_60-LABEL: test_select( |
| ; PTX_60: { |
| ; PTX_60-NEXT: .reg .pred %p<2>; |
| ; PTX_60-NEXT: .reg .b16 %rs<3>; |
| ; PTX_60-NEXT: .reg .b32 %r<4>; |
| ; PTX_60-NEXT: .reg .b64 %rd<3>; |
| ; PTX_60-EMPTY: |
| ; PTX_60-NEXT: // %bb.0: // %bb |
| ; PTX_60-NEXT: ld.param.u8 %rs1, [test_select_param_3]; |
| ; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; |
| ; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1; |
| ; PTX_60-NEXT: ld.param.u64 %rd1, [test_select_param_2]; |
| ; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; PTX_60-NEXT: ld.param.u32 %r1, [test_select_param_1]; |
| ; PTX_60-NEXT: ld.param.u32 %r2, [test_select_param_0]; |
| ; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1; |
| ; PTX_60-NEXT: st.global.u32 [%rd2], %r3; |
| ; PTX_60-NEXT: ret; |
| ; |
| ; PTX_70-LABEL: test_select( |
| ; PTX_70: { |
| ; PTX_70-NEXT: .reg .pred %p<2>; |
| ; PTX_70-NEXT: .reg .b16 %rs<3>; |
| ; PTX_70-NEXT: .reg .b32 %r<2>; |
| ; PTX_70-NEXT: .reg .b64 %rd<10>; |
| ; PTX_70-EMPTY: |
| ; PTX_70-NEXT: // %bb.0: // %bb |
| ; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3]; |
| ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; |
| ; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1; |
| ; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0; |
| ; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2]; |
| ; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2; |
| ; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1; |
| ; PTX_70-NEXT: mov.b64 %rd5, %rd4; |
| ; PTX_70-NEXT: cvta.param.u64 %rd6, %rd5; |
| ; PTX_70-NEXT: mov.b64 %rd7, %rd1; |
| ; PTX_70-NEXT: cvta.param.u64 %rd8, %rd7; |
| ; PTX_70-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; |
| ; PTX_70-NEXT: ld.u32 %r1, [%rd9]; |
| ; PTX_70-NEXT: st.global.u32 [%rd3], %r1; |
| ; PTX_70-NEXT: ret; |
| bb: |
| %ptrnew = select i1 %cond, ptr %input1, ptr %input2 |
| %valloaded = load i32, ptr %ptrnew, align 4 |
| store i32 %valloaded, ptr %out, align 4 |
| ret void |
| } |
| |
| ; test_select_write: same select between two byval i32 arguments as |
| ; test_select, but the selected pointer is written (store of the constant 1) |
| ; instead of read. %out is not used by the body. Because the arguments are |
| ; written, a single set of IR checks covers every opt RUN line, and it expects |
| ; both arguments to be copied into allocas. The expected PTX is likewise shared |
| ; between sm_60 and sm_70 and goes through a local depot. |
| define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { |
| ; COMMON-LABEL: define ptx_kernel void @test_select_write( |
| ; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { |
| ; COMMON-NEXT: [[BB:.*:]] |
| ; COMMON-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 |
| ; COMMON-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) |
| ; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 |
| ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) |
| ; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] |
| ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: test_select_write( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 4 .b8 __local_depot14[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .pred %p<2>; |
| ; PTX-NEXT: .reg .b16 %rs<3>; |
| ; PTX-NEXT: .reg .b32 %r<4>; |
| ; PTX-NEXT: .reg .b64 %rd<6>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %bb |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot14; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: ld.param.u8 %rs1, [test_select_write_param_3]; |
| ; PTX-NEXT: and.b16 %rs2, %rs1, 1; |
| ; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; |
| ; PTX-NEXT: ld.param.u32 %r1, [test_select_write_param_1]; |
| ; PTX-NEXT: st.u32 [%SP], %r1; |
| ; PTX-NEXT: ld.param.u32 %r2, [test_select_write_param_0]; |
| ; PTX-NEXT: st.u32 [%SP+4], %r2; |
| ; PTX-NEXT: add.u64 %rd2, %SPL, 4; |
| ; PTX-NEXT: add.u64 %rd4, %SPL, 0; |
| ; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1; |
| ; PTX-NEXT: mov.b32 %r3, 1; |
| ; PTX-NEXT: st.local.u32 [%rd5], %r3; |
| ; PTX-NEXT: ret; |
| bb: |
| %ptrnew = select i1 %cond, ptr %input1, ptr %input2 |
| store i32 1, ptr %ptrnew, align 4 |
| ret void |
| } |
| |
| ; test_phi: kernel that branches on %cond and merges two pointers with a phi: |
| ; field 0 of byval %input1 (block %first) or field 1 of byval %input2 (block |
| ; %second). The merged pointer is loaded and the value stored to %inout; the |
| ; byval arguments are only read. Expected results differ by run: |
| ;  - SM_60 lower-args and the copy-byval-args run expect both structs to be |
| ;    copied from addrspace(101) into allocas, with the GEPs based on the copies. |
| ;  - SM_70 lower-args expects no copy; the GEPs are based on the results of |
| ;    llvm.nvvm.ptr.param.to.gen. |
| ;  - The sm_60 PTX loads the chosen field directly from parameter space; the |
| ;    sm_70 PTX merges cvta.param addresses and performs one generic load. |
| define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) { |
| ; SM_60-LABEL: define ptx_kernel void @test_phi( |
| ; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { |
| ; SM_60-NEXT: [[BB:.*:]] |
| ; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 |
| ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) |
| ; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) |
| ; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] |
| ; SM_60: [[FIRST]]: |
| ; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 |
| ; SM_60-NEXT: br label %[[MERGE:.*]] |
| ; SM_60: [[SECOND]]: |
| ; SM_60-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 |
| ; SM_60-NEXT: br label %[[MERGE]] |
| ; SM_60: [[MERGE]]: |
| ; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] |
| ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 |
| ; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 |
| ; SM_60-NEXT: ret void |
| ; |
| ; SM_70-LABEL: define ptx_kernel void @test_phi( |
| ; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { |
| ; SM_70-NEXT: [[BB:.*:]] |
| ; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) |
| ; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) |
| ; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] |
| ; SM_70: [[FIRST]]: |
| ; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 |
| ; SM_70-NEXT: br label %[[MERGE:.*]] |
| ; SM_70: [[SECOND]]: |
| ; SM_70-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 |
| ; SM_70-NEXT: br label %[[MERGE]] |
| ; SM_70: [[MERGE]]: |
| ; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] |
| ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 |
| ; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 |
| ; SM_70-NEXT: ret void |
| ; |
| ; COPY-LABEL: define ptx_kernel void @test_phi( |
| ; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { |
| ; COPY-NEXT: [[BB:.*:]] |
| ; COPY-NEXT: [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8 |
| ; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false) |
| ; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) |
| ; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] |
| ; COPY: [[FIRST]]: |
| ; COPY-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 |
| ; COPY-NEXT: br label %[[MERGE:.*]] |
| ; COPY: [[SECOND]]: |
| ; COPY-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT23]], i32 0, i32 1 |
| ; COPY-NEXT: br label %[[MERGE]] |
| ; COPY: [[MERGE]]: |
| ; COPY-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] |
| ; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 |
| ; COPY-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 |
| ; COPY-NEXT: ret void |
| ; |
| ; PTX_60-LABEL: test_phi( |
| ; PTX_60: { |
| ; PTX_60-NEXT: .reg .pred %p<2>; |
| ; PTX_60-NEXT: .reg .b16 %rs<3>; |
| ; PTX_60-NEXT: .reg .b32 %r<5>; |
| ; PTX_60-NEXT: .reg .b64 %rd<3>; |
| ; PTX_60-EMPTY: |
| ; PTX_60-NEXT: // %bb.0: // %bb |
| ; PTX_60-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; |
| ; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; |
| ; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1; |
| ; PTX_60-NEXT: ld.param.u64 %rd2, [test_phi_param_2]; |
| ; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2; |
| ; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_0]; |
| ; PTX_60-NEXT: @%p1 bra $L__BB15_2; |
| ; PTX_60-NEXT: // %bb.1: // %second |
| ; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_1+4]; |
| ; PTX_60-NEXT: $L__BB15_2: // %merge |
| ; PTX_60-NEXT: st.global.u32 [%rd1], %r4; |
| ; PTX_60-NEXT: ret; |
| ; |
| ; PTX_70-LABEL: test_phi( |
| ; PTX_70: { |
| ; PTX_70-NEXT: .reg .pred %p<2>; |
| ; PTX_70-NEXT: .reg .b16 %rs<3>; |
| ; PTX_70-NEXT: .reg .b32 %r<2>; |
| ; PTX_70-NEXT: .reg .b64 %rd<12>; |
| ; PTX_70-EMPTY: |
| ; PTX_70-NEXT: // %bb.0: // %bb |
| ; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; |
| ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; |
| ; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1; |
| ; PTX_70-NEXT: mov.b64 %rd6, test_phi_param_0; |
| ; PTX_70-NEXT: ld.param.u64 %rd7, [test_phi_param_2]; |
| ; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd7; |
| ; PTX_70-NEXT: mov.b64 %rd10, %rd6; |
| ; PTX_70-NEXT: cvta.param.u64 %rd11, %rd10; |
| ; PTX_70-NEXT: @%p1 bra $L__BB15_2; |
| ; PTX_70-NEXT: // %bb.1: // %second |
| ; PTX_70-NEXT: mov.b64 %rd8, test_phi_param_1; |
| ; PTX_70-NEXT: mov.b64 %rd9, %rd8; |
| ; PTX_70-NEXT: cvta.param.u64 %rd2, %rd9; |
| ; PTX_70-NEXT: add.s64 %rd11, %rd2, 4; |
| ; PTX_70-NEXT: $L__BB15_2: // %merge |
| ; PTX_70-NEXT: ld.u32 %r1, [%rd11]; |
| ; PTX_70-NEXT: st.global.u32 [%rd1], %r1; |
| ; PTX_70-NEXT: ret; |
| bb: |
| br i1 %cond, label %first, label %second |
| |
| first: ; preds = %bb |
| %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 |
| br label %merge |
| |
| second: ; preds = %bb |
| %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 |
| br label %merge |
| |
| merge: ; preds = %second, %first |
| %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] |
| %valloaded = load i32, ptr %ptrnew, align 4 |
| store i32 %valloaded, ptr %inout, align 4 |
| ret void |
| } |
| |
| ; test_phi_write: same branch-and-phi pointer merge as test_phi (field 0 of |
| ; byval %input1 or field 1 of byval %input2), but the merged pointer is written |
| ; (store of the constant 1) instead of read. Because the arguments are written, |
| ; a single set of IR checks covers every opt RUN line, and it expects both |
| ; structs to be copied into allocas. The expected PTX is likewise shared |
| ; between sm_60 and sm_70 and goes through a local depot. |
| define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) { |
| ; COMMON-LABEL: define ptx_kernel void @test_phi_write( |
| ; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { |
| ; COMMON-NEXT: [[BB:.*:]] |
| ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 |
| ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) |
| ; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 |
| ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) |
| ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) |
| ; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] |
| ; COMMON: [[FIRST]]: |
| ; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 |
| ; COMMON-NEXT: br label %[[MERGE:.*]] |
| ; COMMON: [[SECOND]]: |
| ; COMMON-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 |
| ; COMMON-NEXT: br label %[[MERGE]] |
| ; COMMON: [[MERGE]]: |
| ; COMMON-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] |
| ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 |
| ; COMMON-NEXT: ret void |
| ; |
| ; PTX-LABEL: test_phi_write( |
| ; PTX: { |
| ; PTX-NEXT: .local .align 4 .b8 __local_depot16[8]; |
| ; PTX-NEXT: .reg .b64 %SP; |
| ; PTX-NEXT: .reg .b64 %SPL; |
| ; PTX-NEXT: .reg .pred %p<2>; |
| ; PTX-NEXT: .reg .b16 %rs<3>; |
| ; PTX-NEXT: .reg .b32 %r<4>; |
| ; PTX-NEXT: .reg .b64 %rd<7>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: // %bb |
| ; PTX-NEXT: mov.b64 %SPL, __local_depot16; |
| ; PTX-NEXT: cvta.local.u64 %SP, %SPL; |
| ; PTX-NEXT: ld.param.u8 %rs1, [test_phi_write_param_2]; |
| ; PTX-NEXT: and.b16 %rs2, %rs1, 1; |
| ; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; |
| ; PTX-NEXT: add.u64 %rd1, %SPL, 0; |
| ; PTX-NEXT: ld.param.u32 %r1, [test_phi_write_param_1+4]; |
| ; PTX-NEXT: st.u32 [%SP], %r1; |
| ; PTX-NEXT: add.u64 %rd6, %SPL, 4; |
| ; PTX-NEXT: ld.param.u32 %r2, [test_phi_write_param_0]; |
| ; PTX-NEXT: st.u32 [%SP+4], %r2; |
| ; PTX-NEXT: @%p1 bra $L__BB16_2; |
| ; PTX-NEXT: // %bb.1: // %second |
| ; PTX-NEXT: mov.b64 %rd6, %rd1; |
| ; PTX-NEXT: $L__BB16_2: // %merge |
| ; PTX-NEXT: mov.b32 %r3, 1; |
| ; PTX-NEXT: st.local.u32 [%rd6], %r3; |
| ; PTX-NEXT: ret; |
| bb: |
| br i1 %cond, label %first, label %second |
| |
| first: ; preds = %bb |
| %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 |
| br label %merge |
| |
| second: ; preds = %bb |
| %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 |
| br label %merge |
| |
| merge: ; preds = %second, %first |
| %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] |
| store i32 1, ptr %ptrnew, align 4 |
| ret void |
| } |
| |
| ; Attribute groups referenced by number from the declarations and definitions |
| ; in this file (for example, memcpy_to_param and copy_on_store carry group 0). |
| ; NOTE(review): group 0 pins "target-cpu"="sm_60" and "+ptx78", while the RUN |
| ; lines pass -mcpu=sm_60 or sm_70 together with -mattr=ptx77 - confirm which |
| ; setting is meant to apply to functions that carry this group. |
| attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" } |
| attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } |
| attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } |
| |
| ; Module-level metadata: module flags (nodes 0-3) and two ident strings |
| ; (nodes 20 and 21). |
| !llvm.module.flags = !{!0, !1, !2, !3} |
| !llvm.ident = !{!20, !21} |
| |
| !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} |
| !1 = !{i32 1, !"wchar_size", i32 4} |
| !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} |
| !3 = !{i32 7, !"frame-pointer", i32 2} |
| !20 = !{!"clang version 20.0.0git"} |
| !21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} |