|  | ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | 
|  | ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s | 
|  | ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s | 
|  |  | 
|  | define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { | 
|  | ; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( | 
|  | ; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { | 
|  | ; NO-PRELOAD-NEXT:    [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] | 
|  | ; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | ; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( | 
|  | ; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] { | 
|  | ; PRELOAD-NEXT:    [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 | 
|  | ; PRELOAD-NEXT:    store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 | 
|  | ; PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | %load = load i32, ptr addrspace(4) %imp_arg_ptr | 
|  | store i32 %load, ptr addrspace(1) %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) { | 
|  | ; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( | 
|  | ; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { | 
|  | ; NO-PRELOAD-NEXT:    [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] | 
|  | ; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | ; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( | 
|  | ; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { | 
|  | ; PRELOAD-NEXT:    [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 | 
|  | ; PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 | 
|  | ; PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | %load = load i32, ptr addrspace(4) %imp_arg_ptr | 
|  | store i32 %load, ptr addrspace(1) %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { | 
|  | ; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( | 
|  | ; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; NO-PRELOAD-NEXT:    [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0 | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] | 
|  | ; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 | 
|  | ; NO-PRELOAD-NEXT:    [[CONV:%.*]] = zext i16 [[LOAD]] to i32 | 
|  | ; NO-PRELOAD-NEXT:    store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | ; PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( | 
|  | ; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { | 
|  | ; PRELOAD-NEXT:    [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 | 
|  | ; PRELOAD-NEXT:    [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 | 
|  | ; PRELOAD-NEXT:    [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 | 
|  | ; PRELOAD-NEXT:    store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 | 
|  | ; PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 | 
|  | %load = load i16, ptr addrspace(4) %gep | 
|  | %conv = zext i16 %load to i32 | 
|  | store i32 %conv, ptr addrspace(1) %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { | 
|  | ; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( | 
|  | ; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; NO-PRELOAD-NEXT:    [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0 | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] | 
|  | ; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 | 
|  | ; NO-PRELOAD-NEXT:    [[CONV_X:%.*]] = zext i16 [[LOAD_X]] to i32 | 
|  | ; NO-PRELOAD-NEXT:    [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 | 
|  | ; NO-PRELOAD-NEXT:    [[CONV_Y:%.*]] = zext i16 [[LOAD_Y]] to i32 | 
|  | ; NO-PRELOAD-NEXT:    [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 | 
|  | ; NO-PRELOAD-NEXT:    [[CONV_Z:%.*]] = zext i16 [[LOAD_Z]] to i32 | 
|  | ; NO-PRELOAD-NEXT:    [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 | 
|  | ; NO-PRELOAD-NEXT:    [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 | 
|  | ; NO-PRELOAD-NEXT:    [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 | 
|  | ; NO-PRELOAD-NEXT:    store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 | 
|  | ; NO-PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | ; PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( | 
|  | ; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { | 
|  | ; PRELOAD-NEXT:    [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; PRELOAD-NEXT:    [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 | 
|  | ; PRELOAD-NEXT:    [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 | 
|  | ; PRELOAD-NEXT:    [[CONV_X:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_X]] to i32 | 
|  | ; PRELOAD-NEXT:    [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 | 
|  | ; PRELOAD-NEXT:    [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 | 
|  | ; PRELOAD-NEXT:    [[CONV_Y:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Y]] to i32 | 
|  | ; PRELOAD-NEXT:    [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 | 
|  | ; PRELOAD-NEXT:    [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 | 
|  | ; PRELOAD-NEXT:    [[CONV_Z:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32 | 
|  | ; PRELOAD-NEXT:    [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 | 
|  | ; PRELOAD-NEXT:    [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 | 
|  | ; PRELOAD-NEXT:    [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 | 
|  | ; PRELOAD-NEXT:    store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 | 
|  | ; PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 | 
|  | %load_x = load i16, ptr addrspace(4) %gep_x | 
|  | %conv_x = zext i16 %load_x to i32 | 
|  | %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 | 
|  | %load_y = load i16, ptr addrspace(4) %gep_y | 
|  | %conv_y = zext i16 %load_y to i32 | 
|  | %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 | 
|  | %load_z = load i16, ptr addrspace(4) %gep_z | 
|  | %conv_z = zext i16 %load_z to i32 | 
|  | %ins.0 =  insertelement <3 x i32> poison, i32 %conv_x, i32 0 | 
|  | %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 | 
|  | %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 | 
|  | store <3 x i32> %ins.2, ptr addrspace(1) %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) { | 
|  | ; NO-PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( | 
|  | ; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; NO-PRELOAD-NEXT:    [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 | 
|  | ; NO-PRELOAD-NEXT:    store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 | 
|  | ; NO-PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | ; PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( | 
|  | ; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; PRELOAD-NEXT:    [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; PRELOAD-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 | 
|  | ; PRELOAD-NEXT:    store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 | 
|  | ; PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | %load = load i64, ptr addrspace(4) %imp_arg_ptr | 
|  | store i64 %load, ptr addrspace(1) %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) { | 
|  | ; NO-PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( | 
|  | ; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; NO-PRELOAD-NEXT:    [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | ; PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( | 
|  | ; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; PRELOAD-NEXT:    [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 | 
|  | ; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 | 
|  | ; PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 | 
|  | ; PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 | 
|  | %load = load i32, ptr addrspace(4) %gep | 
|  | store i32 %load, ptr addrspace(1) %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) { | 
|  | ; NO-PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( | 
|  | ; NO-PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; NO-PRELOAD-NEXT:    [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 | 
|  | ; NO-PRELOAD-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) | 
|  | ; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 | 
|  | ; NO-PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | ; PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( | 
|  | ; PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; PRELOAD-NEXT:    [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | 
|  | ; PRELOAD-NEXT:    [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 | 
|  | ; PRELOAD-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) | 
|  | ; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | ; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 | 
|  | ; PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 | 
|  | ; PRELOAD-NEXT:    ret void | 
|  | ; | 
|  | %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | %load = load i32, ptr addrspace(4) %imp_arg_ptr | 
|  | store i32 %load, ptr addrspace(1) %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ;. | 
|  | ; NO-PRELOAD: [[META0]] = !{} | 
|  | ;. |