test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu - llvm-project/clang - Git at Google

 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target

 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s

 #include "Inputs/cuda.h"

 // Coerced struct from `struct S` without all generic pointers lowered into
 // global ones.

 // On the host-side compilation, generic pointer won't be coerced.
 // HOST-NOT: %struct.S.coerce
 // HOST-NOT: %struct.T.coerce

 // HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(ptr noundef %x)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr addrspace(1){{.*}} %x.coerce)
 // CHECK:     ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD:[0-9]+]]
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
 // OPT: ret void
 __global__ void kernel1(int *x) {
   x[0]++;
 }

 // HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(ptr noundef nonnull align 4 dereferenceable(4) %x)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(ptr addrspace(1){{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
 // CHECK:     ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD]]
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
 // OPT: ret void
 __global__ void kernel2(int &x) {
   x++;
 }

 // HOST: define{{.*}} void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(ptr addrspace(2)  noundef %x, ptr addrspace(1) noundef %y)
 // CHECK-LABEL: define{{.*}} amdgpu_kernel void  @_Z7kernel3PU3AS2iPU3AS1i(ptr addrspace(2){{.*}} %x, ptr addrspace(1){{.*}} %y)
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 __global__ void kernel3(__attribute__((address_space(2))) int *x,
                         __attribute__((address_space(1))) int *y) {
   y[0] = x[0];
 }

 // COMMON-LABEL: define{{.*}} void @_Z4funcPi(ptr{{.*}} %x)
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 __device__ void func(int *x) {
   x[0]++;
 }

 struct S {
   int *x;
   float *y;
 };
 // `by-val` struct is passed by-indirect-alias (a mix of by-ref and indirect
 // by-val). However, the enhanced address inferring pass should be able to
 // assume they are global pointers.
 //
 // HOST: define{{.*}} void @_Z22__device_stub__kernel41S(ptr %s.coerce0, ptr %s.coerce1)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel41S(ptr addrspace(4){{.*}} byref(%struct.S) align 8 %0)
 // OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8
 // OPT: [[G0:%.*]] ={{.*}} addrspacecast ptr [[P0]] to ptr addrspace(1)
 // OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
 // OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1)
 // OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]]
 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
 // OPT: store i32 [[INC]], ptr addrspace(1) [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
 // OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
 // OPT: store float [[ADD]], ptr addrspace(1) [[G1]], align 4
 // OPT: ret void
 __global__ void kernel4(struct S s) {
   s.x[0]++;
   s.y[0] += 1.f;
 }

 // If a pointer to struct is passed, only the pointer itself is coerced into the global one.
 // HOST: define{{.*}} void @_Z22__device_stub__kernel5P1S(ptr noundef %s)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel5P1S(ptr addrspace(1){{.*}} %s.coerce)
 __global__ void kernel5(struct S *s) {
   s->x[0]++;
   s->y[0] += 1.f;
 }

 struct T {
   float *x[2];
 };
 // `by-val` array is passed by-indirect-alias (a mix of by-ref and indirect
 // by-val). However, the enhanced address inferring pass should be able to
 // assume they are global pointers.
 //
 // HOST: define{{.*}} void @_Z22__device_stub__kernel61T(ptr %t.coerce0, ptr %t.coerce1)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel61T(ptr addrspace(4){{.*}} byref(%struct.T) align 8 %0)
 // OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8
 // OPT: [[G0:%.*]] ={{.*}} addrspacecast ptr [[P0]] to ptr addrspace(1)
 // OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
 // OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1)
 // OPT: [[V0:%.*]] = load float, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]]
 // OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
 // OPT: store float [[ADD0]], ptr addrspace(1) [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
 // OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
 // OPT: store float [[ADD1]], ptr addrspace(1) [[G1]], align 4
 // OPT: ret void
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }

 // Check that coerced pointers retain the noalias attribute when qualified with __restrict.
 // HOST: define{{.*}} void @_Z22__device_stub__kernel7Pi(ptr noalias noundef %x)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel7Pi(ptr addrspace(1) noalias{{.*}} %x.coerce)
 __global__ void kernel7(int *__restrict x) {
   x[0]++;
 }

 // Single element struct.
 struct SS {
   float *x;
 };
 // HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(ptr %a.coerce)
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(ptr addrspace(1){{.*}} %a.coerce)
 // CHECK:     ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
 // OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4, !amdgpu.noclobber ![[MD]]
 // OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
 // OPT: store float [[INC]], ptr addrspace(1) %a.coerce, align 4
 // OPT: ret void
 __global__ void kernel8(struct SS a) {
   *a.x += 3.f;
 }
	// REQUIRES: x86-registered-target
	// REQUIRES: amdgpu-registered-target

	// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - \| FileCheck --check-prefixes=COMMON,CHECK %s
	// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - \| opt -S -O2 \| FileCheck %s --check-prefixes=COMMON,OPT
	// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - \| FileCheck -check-prefix=HOST %s

	#include "Inputs/cuda.h"

	// Coerced struct from `struct S` without all generic pointers lowered into
	// global ones.

	// On the host-side compilation, generic pointer won't be coerced.
	// HOST-NOT: %struct.S.coerce
	// HOST-NOT: %struct.T.coerce

	// HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(ptr noundef %x)
	// COMMON-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel1Pi(ptr addrspace(1){{.}} %x.coerce)
	// CHECK: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	// CHECK-NOT: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD:[0-9]+]]
	// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
	// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
	// OPT: ret void
	__global__ void kernel1(int *x) {
	x[0]++;
	}

	// HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(ptr noundef nonnull align 4 dereferenceable(4) %x)
	// COMMON-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel2Ri(ptr addrspace(1){{.}} nonnull align 4 dereferenceable(4) %x.coerce)
	// CHECK: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	// CHECK-NOT: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD]]
	// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
	// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
	// OPT: ret void
	__global__ void kernel2(int &x) {
	x++;
	}

	// HOST: define{{.*}} void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(ptr addrspace(2) noundef %x, ptr addrspace(1) noundef %y)
	// CHECK-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(ptr addrspace(2){{.}} %x, ptr addrspace(1){{.*}} %y)
	// CHECK-NOT: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	__global__ void kernel3(__attribute__((address_space(2))) int *x,
	__attribute__((address_space(1))) int *y) {
	y[0] = x[0];
	}

	// COMMON-LABEL: define{{.}} void @_Z4funcPi(ptr{{.}} %x)
	// CHECK-NOT: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	__device__ void func(int *x) {
	x[0]++;
	}

	struct S {
	int *x;
	float *y;
	};
	// `by-val` struct is passed by-indirect-alias (a mix of by-ref and indirect
	// by-val). However, the enhanced address inferring pass should be able to
	// assume they are global pointers.
	//
	// HOST: define{{.*}} void @_Z22__device_stub__kernel41S(ptr %s.coerce0, ptr %s.coerce1)
	// COMMON-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel41S(ptr addrspace(4){{.}} byref(%struct.S) align 8 %0)
	// OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8
	// OPT: [[G0:%.]] ={{.}} addrspacecast ptr [[P0]] to ptr addrspace(1)
	// OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
	// OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
	// OPT: [[G1:%.]] ={{.}} addrspacecast ptr [[P1]] to ptr addrspace(1)
	// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]]
	// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
	// OPT: store i32 [[INC]], ptr addrspace(1) [[G0]], align 4
	// OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
	// OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
	// OPT: store float [[ADD]], ptr addrspace(1) [[G1]], align 4
	// OPT: ret void
	__global__ void kernel4(struct S s) {
	s.x[0]++;
	s.y[0] += 1.f;
	}

	// If a pointer to struct is passed, only the pointer itself is coerced into the global one.
	// HOST: define{{.*}} void @_Z22__device_stub__kernel5P1S(ptr noundef %s)
	// COMMON-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel5P1S(ptr addrspace(1){{.}} %s.coerce)
	__global__ void kernel5(struct S *s) {
	s->x[0]++;
	s->y[0] += 1.f;
	}

	struct T {
	float *x[2];
	};
	// `by-val` array is passed by-indirect-alias (a mix of by-ref and indirect
	// by-val). However, the enhanced address inferring pass should be able to
	// assume they are global pointers.
	//
	// HOST: define{{.*}} void @_Z22__device_stub__kernel61T(ptr %t.coerce0, ptr %t.coerce1)
	// COMMON-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel61T(ptr addrspace(4){{.}} byref(%struct.T) align 8 %0)
	// OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8
	// OPT: [[G0:%.]] ={{.}} addrspacecast ptr [[P0]] to ptr addrspace(1)
	// OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
	// OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
	// OPT: [[G1:%.]] ={{.}} addrspacecast ptr [[P1]] to ptr addrspace(1)
	// OPT: [[V0:%.*]] = load float, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]]
	// OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
	// OPT: store float [[ADD0]], ptr addrspace(1) [[G0]], align 4
	// OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
	// OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
	// OPT: store float [[ADD1]], ptr addrspace(1) [[G1]], align 4
	// OPT: ret void
	__global__ void kernel6(struct T t) {
	t.x[0][0] += 1.f;
	t.x[1][0] += 2.f;
	}

	// Check that coerced pointers retain the noalias attribute when qualified with __restrict.
	// HOST: define{{.*}} void @_Z22__device_stub__kernel7Pi(ptr noalias noundef %x)
	// COMMON-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel7Pi(ptr addrspace(1) noalias{{.}} %x.coerce)
	__global__ void kernel7(int *__restrict x) {
	x[0]++;
	}

	// Single element struct.
	struct SS {
	float *x;
	};
	// HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(ptr %a.coerce)
	// COMMON-LABEL: define{{.}} amdgpu_kernel void @_Z7kernel82SS(ptr addrspace(1){{.}} %a.coerce)
	// CHECK: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	// CHECK-NOT: ={{.}} addrspacecast ptr addrspace(1) %{{.}} to ptr
	// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4, !amdgpu.noclobber ![[MD]]
	// OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
	// OPT: store float [[INC]], ptr addrspace(1) %a.coerce, align 4
	// OPT: ret void
	__global__ void kernel8(struct SS a) {
	*a.x += 3.f;
	}