// test/OpenMP/reduction_implicit_map.cpp - llvm-project/clang - Git at Google

 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
 // RUN:  -triple powerpc64le-unknown-unknown -DCUDA \
 // RUN:  -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \
 // RUN:  %t-ppc-host.bc

 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
 // RUN:  -triple nvptx64-unknown-unknown -DCUA \
 // RUN:  -fopenmp-targets=nvptx64-nvidia-cuda -DCUDA -emit-llvm %s \
 // RUN:  -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
 // RUN:  -o - | FileCheck %s --check-prefix CHECK

 // RUN: %clang_cc1 -verify -fopenmp -x c++ \
 // RUN:   -triple powerpc64le-unknown-unknown -DDIAG\
 // RUN:   -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm \
 // RUN:   %s -o - | FileCheck  %s \
 // RUN:   --check-prefix=CHECK1

 // RUN: %clang_cc1 -verify -fopenmp -x c++ \
 // RUN:   -triple i386-unknown-unknown \
 // RUN:   -fopenmp-targets=i386-pc-linux-gnu -emit-llvm \
 // RUN:   %s -o - | FileCheck  %s \
 // RUN:   --check-prefix=CHECK2


 #if defined(CUDA)
 // expected-no-diagnostics

 // Test case compiled when CUDA is defined: one target parallel region with a
 // reduction over the array section e[:1] of a pointer. The parameter n is
 // unused; the function always returns 0.
 int foo(int n) {
   // e is never given an address before it is dereferenced below.
   // NOTE(review): presumably acceptable because the invocations at the top of
   // the file only emit LLVM IR/bitcode and never execute it -- confirm.
   double *e;
   //no error and no implicit map generated for e[:1]
   #pragma omp target parallel reduction(+: e[:1])
     *e=10;  // the single statement that forms the target parallel region
   ;         // empty statement following the region
   return 0;
 }
 // CHECK-NOT: @.offload_maptypes
 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
 #elif defined(DIAG)
 // Class type used as the reduction element in bar(). It has a user-provided
 // copy constructor, so it is not trivially copyable; that is the property
 // named by the diagnostic attached to the reduction clause in bar().
 class S2 {
   mutable int a;            // only data member
 public:
   S2():a(0) { }             // default-constructs with a == 0
   S2(S2 &s2):a(s2.a) { }    // copies from a non-const lvalue reference
   S2 &operator +(S2 &s);    // declared only; no definition is visible here
 };
 // Test case compiled when DIAG is defined: two target regions, one reducing
 // over an element of a class-type array and one over a multi-dimensional
 // array section. Always returns 0.
 int bar() {
  S2 o[5];
   // Reduction over one element of an S2 array: a warning ("... not trivially
   // copyable and not guaranteed to be mapped correctly") is emitted and an
   // implicit map is generated.
 #pragma omp target parallel reduction(+:o[0]) //expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
   for (int i = 0; i < 10; i++);
   double b[10][10][10];
   // No error, and no implicit map for the section b[0:2][2:4][1] itself;
   // the map is generated for b rather than for the section.
 #pragma omp target parallel for reduction(task, +: b[0:2][2:4][1])
   for (long long i = 0; i < 10; ++i);
   return 0;
 }
 // map for variable o
 // CHECK1: offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
 // CHECK1: offload_maptypes = private unnamed_addr constant [1 x i64] [i64 547]
 // map for b:
 // CHECK1: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 8000]
 // CHECK1: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547]
 #else
 // expected-no-diagnostics

 // Generate an implicit map for array elements or array sections in a
 // reduction clause. In the following case, the implicit map is generated for
 // output[0] with map size 4 and for output[:3] with map size 12.
 void sum(int* input, int size, int* output)
 {
   // Four target regions follow, each with a reduction item that is an array
   // element or an array section: output[0], output[:3], a[:2] and a[3].

   // Region 1: reduction over the single element output[0]; input[0:size] is
   // mapped explicitly with map(to:).
 #pragma omp target teams distribute parallel for reduction(+: output[0]) \
                                                  map(to: input [0:size])
   for (int i = 0; i < size; i++)
     output[0] += input[i];
   // Region 2: same loop, but the reduction item is the section output[:3].
 #pragma omp target teams distribute parallel for reduction(+: output[:3])  \
                                                  map(to: input [0:size])
   for (int i = 0; i < size; i++)
     output[0] += input[i];
   // Local array used as the reduction base for the last two regions, whose
   // loop bodies are empty.
   int a[10];
   // Region 3: reduction over the section a[:2].
 #pragma omp target parallel reduction(+: a[:2])
   for (int i = 0; i < size; i++)
     ;
   // Region 4: reduction over the single element a[3].
 #pragma omp target parallel reduction(+: a[3])
   for (int i = 0; i < size; i++)
     ;
 }
 //CHECK2: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 8]
 //CHECK2: @.offload_maptypes.10 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
 //CHECK2: @.offload_sizes.13 = private unnamed_addr constant [2 x i64] [i64 4, i64 4]
 //CHECK2: @.offload_maptypes.14 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
 //CHECK2: define dso_local void @_Z3sumPiiS_
 //CHECK2-NEXT: entry
 //CHECK2-NEXT: [[INP:%.*]] = alloca i32*
 //CHECK2-NEXT: [[SIZE:%.*]] = alloca i32
 //CHECK2-NEXT: [[OUTP:%.*]] = alloca i32*
 //CHECK2:      [[OFFSIZE:%.*]] = alloca [3 x i64]
 //CHECK2:      [[OFFSIZE10:%.*]] = alloca [3 x i64]
 //CHECK2:      [[T15:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 0
 //CHECK2-NEXT: store i64 4, i64* [[T15]]
 //CHECK2:      [[T21:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 1
 //CHECK2-NEXT: store i64 4, i64* [[T21]]
 //CHECK2:     [[T53:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 0
 //CHECK2-NEXT: store i64 4, i64* [[T53]]
 //CHECK2:     [[T59:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 1
 //CHECK2-NEXT: store i64 12, i64* [[T59]]
 #endif
 // Calls the test function selected by the CUDA / DIAG macros (foo, bar, or
 // sum when neither is defined) and always returns 0.
 int main()
 {
 #if defined(CUDA)
   int a = foo(10);
 #elif defined(DIAG)
   int a = bar();
 #else
   const int size = 100;
   // The buffer is neither initialized nor freed here.
   // NOTE(review): presumably irrelevant because the test is compile-only --
   // confirm.
   int *array = new int[size];
   int result = 0;
   sum(array, size, &result);
 #endif
   return 0;
 }
	// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
	// RUN: -triple powerpc64le-unknown-unknown -DCUDA \
	// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \
	// RUN: %t-ppc-host.bc

	// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
	// RUN: -triple nvptx64-unknown-unknown -DCUA \
	// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -DCUDA -emit-llvm %s \
	// RUN: -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
	// RUN: -o - | FileCheck %s --check-prefix CHECK

	// RUN: %clang_cc1 -verify -fopenmp -x c++ \
	// RUN: -triple powerpc64le-unknown-unknown -DDIAG\
	// RUN: -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm \
	// RUN: %s -o - | FileCheck %s \
	// RUN: --check-prefix=CHECK1

	// RUN: %clang_cc1 -verify -fopenmp -x c++ \
	// RUN: -triple i386-unknown-unknown \
	// RUN: -fopenmp-targets=i386-pc-linux-gnu -emit-llvm \
	// RUN: %s -o - | FileCheck %s \
	// RUN: --check-prefix=CHECK2


	#if defined(CUDA)
	// expected-no-diagnostics

	// Test case compiled when CUDA is defined: one target parallel region with a
	// reduction over the array section e[:1] of a pointer. The parameter n is
	// unused; the function always returns 0.
	int foo(int n) {
	// e is never given an address before it is dereferenced below.
	// NOTE(review): presumably acceptable because the invocations in this file
	// only emit LLVM IR/bitcode and never execute it -- confirm.
	double *e;
	//no error and no implicit map generated for e[:1]
	#pragma omp target parallel reduction(+: e[:1])
	*e=10;  // the single statement that forms the target parallel region
	;       // empty statement following the region
	return 0;
	}
	// CHECK-NOT: @.offload_maptypes
	// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
	#elif defined(DIAG)
	// Class type used as the reduction element in bar(). It has a user-provided
	// copy constructor, so it is not trivially copyable; that is the property
	// named by the diagnostic attached to the reduction clause in bar().
	class S2 {
	mutable int a;            // only data member
	public:
	S2():a(0) { }             // default-constructs with a == 0
	S2(S2 &s2):a(s2.a) { }    // copies from a non-const lvalue reference
	S2 &operator +(S2 &s);    // declared only; no definition is visible here
	};
	// Test case compiled when DIAG is defined: two target regions, one reducing
	// over an element of a class-type array and one over a multi-dimensional
	// array section. Always returns 0.
	int bar() {
	S2 o[5];
	// Reduction over one element of an S2 array: a warning ("... not trivially
	// copyable and not guaranteed to be mapped correctly") is emitted and an
	// implicit map is generated.
	#pragma omp target parallel reduction(+:o[0]) //expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
	for (int i = 0; i < 10; i++);
	double b[10][10][10];
	// No error, and no implicit map for the section b[0:2][2:4][1] itself;
	// the map is generated for b rather than for the section.
	#pragma omp target parallel for reduction(task, +: b[0:2][2:4][1])
	for (long long i = 0; i < 10; ++i);
	return 0;
	}
	// map for variable o
	// CHECK1: offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
	// CHECK1: offload_maptypes = private unnamed_addr constant [1 x i64] [i64 547]
	// map for b:
	// CHECK1: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 8000]
	// CHECK1: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547]
	#else
	// expected-no-diagnostics

	// Generate an implicit map for array elements or array sections in a
	// reduction clause. In the following case, the implicit map is generated for
	// output[0] with map size 4 and for output[:3] with map size 12.
	void sum(int* input, int size, int* output)
	{
	// Four target regions follow, each with a reduction item that is an array
	// element or an array section: output[0], output[:3], a[:2] and a[3].

	// Region 1: reduction over the single element output[0]; input[0:size] is
	// mapped explicitly with map(to:).
	#pragma omp target teams distribute parallel for reduction(+: output[0]) \
	map(to: input [0:size])
	for (int i = 0; i < size; i++)
	output[0] += input[i];
	// Region 2: same loop, but the reduction item is the section output[:3].
	#pragma omp target teams distribute parallel for reduction(+: output[:3]) \
	map(to: input [0:size])
	for (int i = 0; i < size; i++)
	output[0] += input[i];
	// Local array used as the reduction base for the last two regions, whose
	// loop bodies are empty.
	int a[10];
	// Region 3: reduction over the section a[:2].
	#pragma omp target parallel reduction(+: a[:2])
	for (int i = 0; i < size; i++)
	;
	// Region 4: reduction over the single element a[3].
	#pragma omp target parallel reduction(+: a[3])
	for (int i = 0; i < size; i++)
	;
	}
	//CHECK2: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 8]
	//CHECK2: @.offload_maptypes.10 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
	//CHECK2: @.offload_sizes.13 = private unnamed_addr constant [2 x i64] [i64 4, i64 4]
	//CHECK2: @.offload_maptypes.14 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
	//CHECK2: define dso_local void @_Z3sumPiiS_
	//CHECK2-NEXT: entry
	//CHECK2-NEXT: [[INP:%.*]] = alloca i32*
	//CHECK2-NEXT: [[SIZE:%.*]] = alloca i32
	//CHECK2-NEXT: [[OUTP:%.*]] = alloca i32*
	//CHECK2: [[OFFSIZE:%.*]] = alloca [3 x i64]
	//CHECK2: [[OFFSIZE10:%.*]] = alloca [3 x i64]
	//CHECK2: [[T15:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 0
	//CHECK2-NEXT: store i64 4, i64* [[T15]]
	//CHECK2: [[T21:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 1
	//CHECK2-NEXT: store i64 4, i64* [[T21]]
	//CHECK2: [[T53:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 0
	//CHECK2-NEXT: store i64 4, i64* [[T53]]
	//CHECK2: [[T59:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 1
	//CHECK2-NEXT: store i64 12, i64* [[T59]]
	#endif
	// Calls the test function selected by the CUDA / DIAG macros (foo, bar, or
	// sum when neither is defined) and always returns 0.
	int main()
	{
	#if defined(CUDA)
	int a = foo(10);
	#elif defined(DIAG)
	int a = bar();
	#else
	const int size = 100;
	// The buffer is neither initialized nor freed here.
	// NOTE(review): presumably irrelevant because the test is compile-only --
	// confirm.
	int *array = new int[size];
	int result = 0;
	sum(array, size, &result);
	#endif
	return 0;
	}