offload/test/offloading/multiple_reductions.cpp - llvm-project - Git at Google

 // RUN: %libomptarget-compilexx-run-and-check-generic
 // RUN: %libomptarget-compileoptxx-run-and-check-generic
 // UNSUPPORTED: intelgpu

 #include <cassert>
 #include <cstdio>
 #include <cstdlib>
 #include <iostream>

 #include "omp.h"

 #define N 10000

 template <typename T> static void init_data(T *a) {
   for (int i = 0; i < N; ++i)
     a[i] = i;
 }

 template <typename T> void run_type(void) {
   T s1, s2;

   T *in1 = static_cast<T *>(malloc(N * sizeof(T)));
   T *in2 = static_cast<T *>(malloc(N * sizeof(T)));
   assert(in1 && in2);

   init_data(in1);
   init_data(in2);

 #pragma omp target enter data map(to : in1[0 : N], in2[0 : N])

   // Sum reduction
   s1 = T(0);
 #pragma omp target teams distribute parallel for reduction(+ : s1)
   for (int i = 0; i < N; ++i)
     s1 += in1[i];
   // CHECK: 49995000
   std::cout << s1 << '\n';

   s1 = T(0);
   // Indirect sum reduction
   auto accumulate = [](T a, T b) { return a + b; };
 #pragma omp target teams distribute parallel for reduction(+ : s1)
   for (int i = 0; i < N; i++)
     s1 = accumulate(s1, in1[i]);
   // CHECK: 49995000
   std::cout << s1 << '\n';

   // Dot reduction
   s1 = T(0);
 #pragma omp target teams distribute parallel for reduction(+ : s1)
   for (int i = 0; i < N; ++i)
     s1 += in1[i] * in2[i];
   // CHECK: 2570853208
   std::cout << s1 << '\n';

   // Combined reduction (sum and max) - in the same loop ...
   s1 = s2 = T(0);
 #pragma omp target teams distribute parallel for reduction(+ : s1)             \
     reduction(max : s2)
   for (int i = 0; i < N; ++i) {
     s1 += in1[i];
     s2 = in1[i] > s2 ? in1[i] : s2;
   }
   // CHECK: 49995000 : 9999
   std::cout << s1 << " : " << s2 << '\n';

   // ... and in separate loops
   s1 = s2 = T(0);
 #pragma omp target map(tofrom : s1, s2)
 #pragma omp teams reduction(+ : s1) reduction(max : s2)
   {
 #pragma omp distribute parallel for reduction(+ : s1)
     for (int i = 0; i < N; i++)
       s1 += in1[i];

 #pragma omp distribute parallel for reduction(max : s2)
     for (int i = 0; i < N; i++)
       s2 = in1[i] > s2 ? in1[i] : s2;
   }
   // CHECK: 49995000 : 9999
   std::cout << s1 << " : " << s2 << '\n';

   // Reduction in a kernel that is also doing something completely
   // unrelated to the reduction (pure register work, no memory ops).
   s1 = T(0);
 #pragma omp target map(tofrom : s1)
 #pragma omp teams reduction(+ : s1)
   {
 #pragma omp distribute parallel for reduction(+ : s1)
     for (int i = 0; i < N; i++)
       s1 += in1[i];

     // Just do something, without actually doing anything
 #pragma omp parallel
     {
       int x = omp_get_thread_num();
       for (int j = 0; j < 100; j++)
         x = x * 0.9 + j;
       if (x == -1)
         s1 += x;
     }
   }
   // CHECK: 49995000
   std::cout << s1 << '\n';

 #pragma omp target exit data map(delete : in1[0 : N], in2[0 : N])

   free(in1);
   free(in2);
 }

 int main(int argc, char **argv) {
   run_type<double>();
   run_type<unsigned>();
   run_type<unsigned long>();

   // Reduction calculating pi
   double pi = 0.0;
   // https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
 #pragma omp target teams distribute parallel for reduction(+ : pi)
   for (int i = 0; i < N; i++) {
     double term = 1.0 / (2 * i + 1);
     pi += (i & 0x1) ? -term : term;
   }
   // CHECK: 3.141
   printf("%.3f\n", pi * 4.0);

   return EXIT_SUCCESS;
 }
	// RUN: %libomptarget-compilexx-run-and-check-generic
	// RUN: %libomptarget-compileoptxx-run-and-check-generic
	// UNSUPPORTED: intelgpu

	#include <cassert>
	#include <cstdio>
	#include <cstdlib>
	#include <iostream>

	#include "omp.h"

	#define N 10000

	template <typename T> static void init_data(T *a) {
	for (int i = 0; i < N; ++i)
	a[i] = i;
	}

	template <typename T> void run_type(void) {
	T s1, s2;

	T in1 = static_cast<T >(malloc(N * sizeof(T)));
	T in2 = static_cast<T >(malloc(N * sizeof(T)));
	assert(in1 && in2);

	init_data(in1);
	init_data(in2);

	#pragma omp target enter data map(to : in1[0 : N], in2[0 : N])

	// Sum reduction
	s1 = T(0);
	#pragma omp target teams distribute parallel for reduction(+ : s1)
	for (int i = 0; i < N; ++i)
	s1 += in1[i];
	// CHECK: 49995000
	std::cout << s1 << '\n';

	s1 = T(0);
	// Indirect sum reduction
	auto accumulate = [](T a, T b) { return a + b; };
	#pragma omp target teams distribute parallel for reduction(+ : s1)
	for (int i = 0; i < N; i++)
	s1 = accumulate(s1, in1[i]);
	// CHECK: 49995000
	std::cout << s1 << '\n';

	// Dot reduction
	s1 = T(0);
	#pragma omp target teams distribute parallel for reduction(+ : s1)
	for (int i = 0; i < N; ++i)
	s1 += in1[i] * in2[i];
	// CHECK: 2570853208
	std::cout << s1 << '\n';

	// Combined reduction (sum and max) - in the same loop ...
	s1 = s2 = T(0);
	#pragma omp target teams distribute parallel for reduction(+ : s1) \
	reduction(max : s2)
	for (int i = 0; i < N; ++i) {
	s1 += in1[i];
	s2 = in1[i] > s2 ? in1[i] : s2;
	}
	// CHECK: 49995000 : 9999
	std::cout << s1 << " : " << s2 << '\n';

	// ... and in separate loops
	s1 = s2 = T(0);
	#pragma omp target map(tofrom : s1, s2)
	#pragma omp teams reduction(+ : s1) reduction(max : s2)
	{
	#pragma omp distribute parallel for reduction(+ : s1)
	for (int i = 0; i < N; i++)
	s1 += in1[i];

	#pragma omp distribute parallel for reduction(max : s2)
	for (int i = 0; i < N; i++)
	s2 = in1[i] > s2 ? in1[i] : s2;
	}
	// CHECK: 49995000 : 9999
	std::cout << s1 << " : " << s2 << '\n';

	// Reduction in a kernel that is also doing something completely
	// unrelated to the reduction (pure register work, no memory ops).
	s1 = T(0);
	#pragma omp target map(tofrom : s1)
	#pragma omp teams reduction(+ : s1)
	{
	#pragma omp distribute parallel for reduction(+ : s1)
	for (int i = 0; i < N; i++)
	s1 += in1[i];

	// Just do something, without actually doing anything
	#pragma omp parallel
	{
	int x = omp_get_thread_num();
	for (int j = 0; j < 100; j++)
	x = x * 0.9 + j;
	if (x == -1)
	s1 += x;
	}
	}
	// CHECK: 49995000
	std::cout << s1 << '\n';

	#pragma omp target exit data map(delete : in1[0 : N], in2[0 : N])

	free(in1);
	free(in2);
	}

	int main(int argc, char **argv) {
	run_type<double>();
	run_type<unsigned>();
	run_type<unsigned long>();

	// Reduction calculating pi
	double pi = 0.0;
	// https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
	#pragma omp target teams distribute parallel for reduction(+ : pi)
	for (int i = 0; i < N; i++) {
	double term = 1.0 / (2 * i + 1);
	pi += (i & 0x1) ? -term : term;
	}
	// CHECK: 3.141
	printf("%.3f\n", pi * 4.0);

	return EXIT_SUCCESS;
	}