// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H
#define _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H

#include "util.h"

namespace __pstl
{
namespace __omp_backend
{
//------------------------------------------------------------------------
// parallel_transform_reduce
//
// Notation:
// r(i,j,init) returns the reduction of init with the reduction over [i,j)
// u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
// c(x,y) combines values x and y that were the result of r or u
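//
// For example, summing squares via std::transform_reduce maps onto this
// notation as: u(i) = square of *(first + i), r(i,j,init) = init plus the
// sum of squares over [i,j), and c(x,y) = x + y.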
//------------------------------------------------------------------------
template <class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner, class _Reduction>
auto
__transform_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryOp __unary_op, _Value __init,
_Combiner __combiner, _Reduction __reduction)
{
const std::size_t __num_threads = omp_get_num_threads();
const std::size_t __size = __last - __first;
// Initial partition of the iteration space into chunks. If the range is too
// small, this results in a degenerate policy, so the size is checked below as well.
auto __policy = __omp_backend::__chunk_partitioner(__first + __num_threads, __last);
if (__size <= __num_threads || __policy.__n_chunks < 2)
{
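// Not enough work to parallelize: perform a serial reduction over the
// whole range, folding __init in exactly once.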
return __reduction(__first, __last, __init);
}
// We cannot use an OpenMP user-defined reduction (UDR) here because the init
// value would have to be stored in the combiner, where it would be applied
// several times even though it must be applied exactly once. Instead, we
// manually generate an identity-like starting element for each thread.
std::vector<_Value> __accums;
__accums.reserve(__num_threads);
// Initialize the accumulators for all threads.
for (std::size_t __i = 0; __i < __num_threads; ++__i)
{
__accums.emplace_back(__unary_op(__first + __i));
}
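// The first __num_threads elements were consumed to seed the accumulators
// above, which is why the chunk partitioner starts at __first + __num_threads.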
// Main loop.
_PSTL_PRAGMA(omp taskloop shared(__accums))
for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
{
__pstl::__omp_backend::__process_chunk(__policy, __first + __num_threads, __chunk,
[&](auto __chunk_first, auto __chunk_last)
{
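// Each task writes only to the slot owned by the thread executing it,
// so no synchronization on __accums is required.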
auto __thread_num = omp_get_thread_num();
__accums[__thread_num] =
__reduction(__chunk_first, __chunk_last, __accums[__thread_num]);
});
}
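// The taskloop has an implicit taskgroup, so every chunk task has finished
// before the accumulators are combined below.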
// Combine the per-thread accumulators.
for (std::size_t __i = 0; __i < __num_threads; ++__i)
{
__init = __combiner(__init, __accums[__i]);
}
return __init;
}
template <class _ExecutionPolicy, class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner,
class _Reduction>
_Value
__parallel_transform_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last,
_UnaryOp __unary_op, _Value __init, _Combiner __combiner, _Reduction __reduction)
{
_Value __result = __init;
if (omp_in_parallel())
{
// We do not create a nested parallel region inside an existing parallel
// region; we just create tasks instead.
__result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
__reduction);
}
else
{
// Create a parallel region, and a single thread will create tasks
// for the region.
_PSTL_PRAGMA(omp parallel)
_PSTL_PRAGMA(omp single nowait)
{
__result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
__reduction);
}
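// The implicit barrier at the end of the parallel region guarantees that
// __result is fully written before it is read below.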
}
return __result;
}
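
// A hypothetical usage sketch (not part of this header): summing squared
// doubles over a std::vector __v. The container and callables are
// illustrative only; note that __unary_op receives an iterator, not a value.
//
//   double __sum = __pstl::__omp_backend::__parallel_transform_reduce(
//       std::execution::par, __v.begin(), __v.end(),
//       [](auto __it) { return *__it * *__it; }, // __unary_op(iterator)
//       0.0,                                     // __init
//       std::plus<>{},                           // __combiner
//       [](auto __f, auto __l, double __in)      // __reduction over [__f, __l)
//       {
//           for (; __f != __l; ++__f)
//               __in += *__f * *__f;
//           return __in;
//       });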
} // namespace __omp_backend
} // namespace __pstl
#endif // _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H