blob: 3187076d06f6c6b2f03bb4706f87d602a32ce4d3 [file]
// RUN: %libomptarget-compilexx-run-and-check-generic
// RUN: %libomptarget-compileoptxx-run-and-check-generic
// UNSUPPORTED: intelgpu
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include "omp.h"
#define N 10000
template <typename T> static void init_data(T *a) {
for (int i = 0; i < N; ++i)
a[i] = i;
}
template <typename T> void run_type(void) {
T s1, s2;
T *in1 = static_cast<T *>(malloc(N * sizeof(T)));
T *in2 = static_cast<T *>(malloc(N * sizeof(T)));
assert(in1 && in2);
init_data(in1);
init_data(in2);
#pragma omp target enter data map(to : in1[0 : N], in2[0 : N])
// Sum reduction
s1 = T(0);
#pragma omp target teams distribute parallel for reduction(+ : s1)
for (int i = 0; i < N; ++i)
s1 += in1[i];
// CHECK: 49995000
std::cout << s1 << '\n';
s1 = T(0);
// Indirect sum reduction
auto accumulate = [](T a, T b) { return a + b; };
#pragma omp target teams distribute parallel for reduction(+ : s1)
for (int i = 0; i < N; i++)
s1 = accumulate(s1, in1[i]);
// CHECK: 49995000
std::cout << s1 << '\n';
// Dot reduction
s1 = T(0);
#pragma omp target teams distribute parallel for reduction(+ : s1)
for (int i = 0; i < N; ++i)
s1 += in1[i] * in2[i];
// CHECK: 2570853208
std::cout << s1 << '\n';
// Combined reduction (sum and max) - in the same loop ...
s1 = s2 = T(0);
#pragma omp target teams distribute parallel for reduction(+ : s1) \
reduction(max : s2)
for (int i = 0; i < N; ++i) {
s1 += in1[i];
s2 = in1[i] > s2 ? in1[i] : s2;
}
// CHECK: 49995000 : 9999
std::cout << s1 << " : " << s2 << '\n';
// ... and in separate loops
s1 = s2 = T(0);
#pragma omp target map(tofrom : s1, s2)
#pragma omp teams reduction(+ : s1) reduction(max : s2)
{
#pragma omp distribute parallel for reduction(+ : s1)
for (int i = 0; i < N; i++)
s1 += in1[i];
#pragma omp distribute parallel for reduction(max : s2)
for (int i = 0; i < N; i++)
s2 = in1[i] > s2 ? in1[i] : s2;
}
// CHECK: 49995000 : 9999
std::cout << s1 << " : " << s2 << '\n';
// Reduction in a kernel that is also doing something completely
// unrelated to the reduction (pure register work, no memory ops).
s1 = T(0);
#pragma omp target map(tofrom : s1)
#pragma omp teams reduction(+ : s1)
{
#pragma omp distribute parallel for reduction(+ : s1)
for (int i = 0; i < N; i++)
s1 += in1[i];
// Just do something, without actually doing anything
#pragma omp parallel
{
int x = omp_get_thread_num();
for (int j = 0; j < 100; j++)
x = x * 0.9 + j;
if (x == -1)
s1 += x;
}
}
// CHECK: 49995000
std::cout << s1 << '\n';
#pragma omp target exit data map(delete : in1[0 : N], in2[0 : N])
free(in1);
free(in2);
}
int main(int argc, char **argv) {
run_type<double>();
run_type<unsigned>();
run_type<unsigned long>();
// Reduction calculating pi
double pi = 0.0;
// https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
#pragma omp target teams distribute parallel for reduction(+ : pi)
for (int i = 0; i < N; i++) {
double term = 1.0 / (2 * i + 1);
pi += (i & 0x1) ? -term : term;
}
// CHECK: 3.141
printf("%.3f\n", pi * 4.0);
return EXIT_SUCCESS;
}