/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD
// avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};
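
// Illustrative usage sketch (not part of the runtime): for a typical
// collapse(3) nest the element count fits in maxElemCount, so the storage
// stays on the stack and no heap traffic occurs; only deeper nests pay for
// a __kmp_allocate/__kmp_free pair:
//
//   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); // stack if n <= 4
//   for (kmp_index_t i = 0; i < n; ++i)
//     iterations[i] = 0; // operator[] works for either backing store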

//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. Should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop
    // was not well formed):
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
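
// Example (illustrative): "for (int i = 0; i != 10; i += 2)" arrives as
// { lb0 = 0, ub0 = 10, step = 2, comp_not_eq }. The positive step turns
// != into <, and then < into <= by decrementing ub0, leaving
// { lb0 = 0, ub0 = 9, step = 2, comp_less_or_eq }.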

// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use span for this, it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
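
// Example (illustrative): with the canonicalized bounds from above,
// { lb0 = 0, ub0 = 9, step = 2, comp_less_or_eq }, the formula gives
// trip_count = (9 - 0) / 2 + 1 = 5, i.e. i in {0, 2, 4, 6, 8}.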

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Returns a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in
// user code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
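
// Example (illustrative): for loop_type_int8 the value 0xFF is first
// narrowed to kmp_int8 (-1) and then sign-extended back to kmp_uint64
// (0xFFFFFFFFFFFFFFFF), so a later static_cast<kmp_int8> in user code
// recovers -1 exactly.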

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, should start
    // with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}
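
// Example (illustrative): for an inner loop "j = i; j <= N; j++" we have
// lb0 = 0, lb1 = 1, step = 1, so with outer_iv = i and iteration = k this
// computes j = 0 + 1 * i + k * 1; the optional bounds check then rejects
// points where j would exceed ub0 + ub1 * i.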

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind.
// Unlike the non-rectangular case there is nothing to check or return here:
// in a rectangular nest the iterations come from the trip counts, so every
// computed point is within bounds by construction.
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}
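
// Example (illustrative): for "collapse(2)" over i in 0..2 and j in 0..3
// the per-level trip counts are 3 and 4, so this returns 12; the compiler
// can then schedule new_iv in [0, 12) across the team.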

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IV will be returned as if it had kmp_uint64 type,
// will have to be converted to original type in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}
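
// Example (illustrative): continuing the 3 x 4 nest above, new_iv = 7
// decomposes innermost-first as 7 % 4 = 3 (inner iteration) and
// 7 / 4 = 1 (outer iteration), i.e. the point (i, j) = (1, 3) for
// zero-based bounds and unit steps.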

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so lower bound should be smallest
// value, upper bound should be the biggest value. If the loop won't execute,
// 'smallest' may be bigger than 'biggest', but we'd better not switch them
// around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped shape.
// We need to stay in the original range of values, so that there will be no
// overflow; for that we'll adjust both upper and lower bounds as needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
    // The idea here is that for this IV we are now getting the same span
    // irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // still need to move lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // still need to move upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}
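
// Example (illustrative): the lower-triangular nest "i = 0..N; j = 0..i"
// has lb1 = 0, ub1 = 1 on the inner level. Since __kmp_sign(0) !=
// __kmp_sign(1), both are zeroed, and ub0 is then raised by
// (old_ub1 - 0) * previous->span_biggest == N, expanding the inner loop
// to the rectangle j = 0..N. Chunk ends computed in this expanded space
// are later pulled back into the original space.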

// Do all processing for one canonicalized loop in the nest
// (assuming that outer loops already were processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process loop nest to have a parallelepiped shape,
// calculate biggest spans for IV's on all levels and calculate overall trip
// count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate number of iterations in original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
// (not counting initial iteration)
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}
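
// Example (illustrative): for "j = i; j <= N; j++" (lb0 = 0, lb1 = 1,
// step = 1) a point with inner iv j and outer iv i yields
// iterations = (j - 0 - 1 * i) / 1 = j - i, i.e. the distance from the
// level's lower bound.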

// Calculate number of iterations in the original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take updated bounds and calculate what new_iv will correspond to this point.
// When we are getting original IVs from new_iv, we have to adjust to fit into
// original loops bounds. Getting new_iv for the adjusted original IVs will help
// with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
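
// Example (illustrative): this is the inverse of the decomposition done in
// __kmpc_calc_original_ivs_rectang, but over the updated (parallelepiped)
// trip counts: for per-level trip counts {3, 4} and per-level iteration
// numbers {1, 3} it computes new_iv = 1 * 4 + 3 = 7.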

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}
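
// Note (illustrative): the walk above behaves like an odometer with variable
// digit ranges. When a level's iv overruns its upper bound, the code
// "carries": it bumps iterations[ind - 1], zeroes all inner counters, and
// re-descends starting from the lower bounds of the inner levels.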

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: just taking the lower bounds of all loops may not work -
// inner loops may be empty on some iterations of the outer loops.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, may as well use
    // the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    //   that
    // - we have to go into original space, even if that means adding more
    //   iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    //   starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels start of the chunk is same as the end, need to
      // be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // End of the chunk can't be smaller (for >= bigger) than its start.
        // Use heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// esp. on the lower bounds side. When getting the result we need to make sure
// that the new chunk starts at the next position after the old chunk, not
// overlapping with it (this is done elsewhere), and that the end of the chunk
// is further than its beginning. We don't need an exact ending point here,
// just something more-or-less close to the desired chunk length; bigger is
// fine (smaller would be fine too, but we risk going into an infinite loop,
// so go smaller only at the very end of the space).
// Returns false if the ending point could not be found in the original loop
// space. In this case the caller can use the original upper bounds as the end
// of the chunk. The chunk won't be empty, because it'll have at least the
// starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
// this point is in the original space (it's likely not).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

/**************************************************************************
 * Identify nested loop structure - loops come in the canonical form
 * Lower triangle matrix: i = 0; i <= N; i++        {0,0}:{N,0}
 *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
 * Upper Triangle matrix
 *                        i = 0;     i <= N; i++    {0,0}:{N,0}
 *                        j = 0+1*i; j <= N; j++    {0,1}:{N,0}
 * ************************************************************************/
nested_loop_type_t
kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
                                   /*in*/ kmp_index_t n) {
  // only 2-level nested loops are supported
  if (n != 2) {
    return nested_loop_type_unkown;
  }
  // loops must be canonical
  KMP_ASSERT(
      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb0_u64);
  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb1_u64);
  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub1_u64);
  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
    return nested_loop_type_unkown;
  }
  // check inner bounds to determine triangle type
  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb0_u64);
  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub0_u64);
  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb1_u64);
  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub1_u64);
  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
      (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
    return nested_loop_type_lower_triangular_matrix;
  }
  // upper triangle loop inner bounds need to be {0,1}:{N,0}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
    return nested_loop_type_upper_triangular_matrix;
  }
  return nested_loop_type_unkown;
}
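
// Example (illustrative): "for (i = 0; i <= N; i++) for (j = 0; j <= i; j++)"
// canonicalizes to outer {0,0}:{N,0} and inner {0,0}:{0,1} and is detected as
// lower-triangular; "for (j = i; j <= N; j++)" gives inner {0,1}:{N,0}, the
// upper-triangular pattern. Anything else is reported as
// nested_loop_type_unkown.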

/**************************************************************************
 * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
 * Start point is x so the result is always > sqrt(x)
 * The method has uniform convergence, PRECISION is set to 0.1
 * ************************************************************************/
#define level_of_precision 0.1
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
  double sqrt_old = 0.;
  double sqrt_new = (double)x;
  do {
    sqrt_old = sqrt_new;
    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
  } while ((sqrt_old - sqrt_new) > level_of_precision);
  return sqrt_new;
}
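
// Example (illustrative): for x = 16 the iterates are roughly 16, 8.5, 5.19,
// 4.14, 4.002, 4.0000006; the loop stops once successive iterates differ by
// at most level_of_precision, returning a value slightly above the true root,
// which is what the bound calculations below rely on.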
|  |  | 
|  | /************************************************************************** | 
|  | *  Handle lower triangle matrix in the canonical form | 
|  | *  i = 0; i <= N; i++          {0,0}:{N,0} | 
|  | *  j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1} | 
|  | * ************************************************************************/ | 
|  | void kmp_handle_lower_triangle_matrix( | 
|  | /*in*/ kmp_uint32 nth, | 
|  | /*in*/ kmp_uint32 tid, | 
|  | /*in */ kmp_index_t n, | 
|  | /*in/out*/ bounds_info_t *original_bounds_nest, | 
|  | /*out*/ bounds_info_t *chunk_bounds_nest) { | 
|  |  | 
|  | // transfer loop types from the original loop to the chunks | 
|  | for (kmp_index_t i = 0; i < n; ++i) { | 
|  | chunk_bounds_nest[i] = original_bounds_nest[i]; | 
|  | } | 
|  | // cleanup iv variables | 
|  | kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, | 
|  | original_bounds_nest[0].ub0_u64); | 
|  | kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, | 
|  | original_bounds_nest[0].lb0_u64); | 
|  | kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, | 
|  | original_bounds_nest[1].ub0_u64); | 
|  | // calculate the chunk's lower and upper bounds | 
|  | // the total number of iterations in the loop is the sum of the arithmetic | 
|  | // progression from the outer lower to outer upper bound (inclusive since the | 
|  | // loop is canonical) note that less_than inner loops (inner_ub0 = -1) | 
|  | // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0 | 
|  | // + 1) -> N - 1 | 
|  | kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0; | 
|  | kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2; | 
|  | // the current thread's number of iterations: | 
|  | // each thread gets an equal number of iterations: total number of iterations | 
|  | // divided by the number of threads plus, if there's a remainder, | 
|  | // the first threads with the number up to the remainder get an additional | 
|  | // iteration each to cover it | 
|  | kmp_uint64 iter_current = | 
|  | iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0); | 
// cumulative number of iterations executed by all the previous threads:
// threads with tid below the remainder have (iter_total/nth + 1) iterations,
// and so do all threads before them, so the cumulative count is the current
// thread's iteration count multiplied by the number of previous threads,
// which equals the current thread's tid; threads with tid at or above the
// remainder have (iter_total/nth) iterations, so the cumulative count is
// again its iteration count multiplied by tid (the number of previous
// threads), PLUS all the remainder iterations already executed by the
// previous threads
|  | kmp_uint64 iter_before_current = | 
|  | tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth)); | 
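// Continuing the example with nth = 4: iter_total = 10 leaves remainder 2, so
// tids 0 and 1 get 3 iterations each and tids 2 and 3 get 2 each;
// iter_before_current is then 0, 3, 6 and 8 respectively.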
|  | // cumulative number of iterations executed with the current thread is | 
|  | // the cumulative number executed before it plus its own | 
|  | kmp_uint64 iter_with_current = iter_before_current + iter_current; | 
// calculate the outer loop lower bound (lbo), which is the max outer iv value
// that yields an iteration count equal to or just below the total number of
// iterations executed by the previous threads; for less_than (1-based) inner
// loops (inner_ub0 == -1) this means
// lbo*(lbo-1)/2 <= iter_before_current => lbo^2-lbo-2*iter_before_current <= 0
// and for less_than_equal (0-based) inner loops (inner_ub0 == 0) it means
// lbo*(lbo+1)/2 <= iter_before_current => lbo^2+lbo-2*iter_before_current <= 0;
// both cases can be handled similarly, using a parameter to control the sign
// in the equation
|  | kmp_int64 inner_adjustment = 1 + 2 * inner_ub0; | 
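// inner_adjustment is +1 for less_than_equal inner loops (inner_ub0 == 0) and
// -1 for less_than loops (inner_ub0 holds -1 as a kmp_uint64; the unsigned
// multiply-add wraps around to the intended signed value)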
|  | kmp_uint64 lower_bound_outer = | 
|  | (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment + | 
|  | 8 * iter_before_current) + | 
|  | inner_adjustment) / | 
|  | 2 - | 
|  | inner_adjustment; | 
// calculate the inner loop lower bound, which is the remaining number of
// iterations needed to reach the total executed by the previous threads;
// this gives the starting point of this thread
|  | kmp_uint64 lower_bound_inner = | 
|  | iter_before_current - | 
|  | ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2; | 
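// In the running example, thread 1 has iter_before_current = 3 and
// inner_adjustment = 1, so lower_bound_outer = (sqrt(1 + 24) + 1) / 2 - 1 = 2
// (the largest row preceded by 2 * 3 / 2 = 3 <= 3 iterations) and
// lower_bound_inner = 3 - 3 = 0: thread 1 starts at {2, 0}.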
// calculate the outer loop upper bound using the same approach as for the
// outer lower bound, except using the total number of iterations executed
// with the current thread
|  | kmp_uint64 upper_bound_outer = | 
|  | (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment + | 
|  | 8 * iter_with_current) + | 
|  | inner_adjustment) / | 
|  | 2 - | 
|  | inner_adjustment; | 
// calculate the inner loop upper bound, which is the remaining number of
// iterations required to hit the total number of iterations executed
// including the current thread; this marks the starting point of the next
// thread
|  | kmp_uint64 upper_bound_inner = | 
|  | iter_with_current - | 
|  | ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2; | 
// adjust the upper bounds down by 1 element so they point at the last
// iteration of the current thread rather than the first iteration of the next
// thread
|  | if (upper_bound_inner == 0) { | 
|  | // {n,0} => {n-1,n-1} | 
|  | upper_bound_outer -= 1; | 
|  | upper_bound_inner = upper_bound_outer; | 
|  | } else { | 
|  | // {n,m} => {n,m-1} (m!=0) | 
|  | upper_bound_inner -= 1; | 
|  | } | 
|  |  | 
// assign the values, zeroing out the lb1 and ub1 coefficients since the chunk
// bounds no longer depend on the outer induction variable
|  | chunk_bounds_nest[0].lb0_u64 = lower_bound_outer; | 
|  | chunk_bounds_nest[1].lb0_u64 = lower_bound_inner; | 
|  | chunk_bounds_nest[0].ub0_u64 = upper_bound_outer; | 
|  | chunk_bounds_nest[1].ub0_u64 = upper_bound_inner; | 
|  | chunk_bounds_nest[0].lb1_u64 = 0; | 
|  | chunk_bounds_nest[0].ub1_u64 = 0; | 
|  | chunk_bounds_nest[1].lb1_u64 = 0; | 
|  | chunk_bounds_nest[1].ub1_u64 = 0; | 
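// For the running example (N = 3, nth = 4) the resulting chunks are:
//   tid 0: {0,0}..{1,1} (3 iterations)
//   tid 1: {2,0}..{2,2} (3 iterations)
//   tid 2: {3,0}..{3,1} (2 iterations)
//   tid 3: {3,2}..{3,3} (2 iterations)
// covering all 10 iterations contiguously and without overlap.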
|  |  | 
|  | #if 0 | 
|  | printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n", | 
|  | tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64, | 
|  | chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total); | 
|  | #endif | 
|  | } | 
|  |  | 
|  | /************************************************************************** | 
|  | *  Handle upper triangle matrix in the canonical form | 
|  | *  i = 0; i <= N; i++     {0,0}:{N,0} | 
|  | *  j = 0+1*i; j <= N; j++ {0,1}:{N,0} | 
|  | * ************************************************************************/ | 
|  | void kmp_handle_upper_triangle_matrix( | 
|  | /*in*/ kmp_uint32 nth, | 
|  | /*in*/ kmp_uint32 tid, | 
/*in*/ kmp_index_t n,
|  | /*in/out*/ bounds_info_t *original_bounds_nest, | 
|  | /*out*/ bounds_info_t *chunk_bounds_nest) { | 
|  |  | 
|  | // transfer loop types from the original loop to the chunks | 
|  | for (kmp_index_t i = 0; i < n; ++i) { | 
|  | chunk_bounds_nest[i] = original_bounds_nest[i]; | 
|  | } | 
// normalize the induction variable bound values to their declared IV types
|  | kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, | 
|  | original_bounds_nest[0].ub0_u64); | 
|  | kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, | 
|  | original_bounds_nest[0].lb0_u64); | 
|  | [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv( | 
|  | original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64); | 
// calculate the chunk's lower and upper bounds
// the total number of iterations in the loop is the sum of the arithmetic
// progression from the outer lower to the outer upper bound (inclusive, since
// the loop is canonical); unlike the lower-triangle case the inner upper
// bound does not enter the count, so only the outer extent matters
|  | kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1); | 
|  | kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2; | 
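// Worked example (values chosen for illustration): for the nest
//   for (i = 0; i <= 3; i++) for (j = i; j <= 3; j++)
// outer_iters = 4 and iter_total = 4 * 5 / 2 = 10, matching the rows of
// length 4, 3, 2 and 1 for i = 0..3.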
// the current thread's number of iterations:
// each thread gets an equal share (the total number of iterations divided by
// the number of threads); if there is a remainder, the first "remainder"
// threads get one additional iteration each to cover it
|  | kmp_uint64 iter_current = | 
|  | iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0); | 
// cumulative number of iterations executed by all the previous threads:
// threads with tid below the remainder have (iter_total/nth + 1) iterations,
// and so do all threads before them, so the cumulative count is the current
// thread's iteration count multiplied by the number of previous threads,
// which equals the current thread's tid; threads with tid at or above the
// remainder have (iter_total/nth) iterations, so the cumulative count is
// again its iteration count multiplied by tid (the number of previous
// threads), PLUS all the remainder iterations already executed by the
// previous threads
|  | kmp_uint64 iter_before_current = | 
|  | tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth)); | 
|  | // cumulative number of iterations executed with the current thread is | 
|  | // the cumulative number executed before it plus its own | 
|  | kmp_uint64 iter_with_current = iter_before_current + iter_current; | 
// calculate the outer loop lower bound (lbo), which is the max outer iv value
// that yields an iteration count equal to or just below the total number of
// iterations executed by the previous threads:
// lbo*(lbo+1)/2 <= iter_before_current =>
// lbo^2+lbo-2*iter_before_current <= 0
|  | kmp_uint64 lower_bound_outer = | 
|  | (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1; | 
// calculate the inner loop lower bound, which is the remaining number of
// iterations needed to reach the total executed by the previous threads;
// this gives the starting point of this thread
|  | kmp_uint64 lower_bound_inner = | 
|  | iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2; | 
// calculate the outer loop upper bound using the same approach as for the
// outer lower bound, except using the total number of iterations executed
// with the current thread
|  | kmp_uint64 upper_bound_outer = | 
|  | (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1; | 
// calculate the inner loop upper bound, which is the remaining number of
// iterations required to hit the total number of iterations executed
// including the current thread; this marks the starting point of the next
// thread
|  | kmp_uint64 upper_bound_inner = | 
|  | iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2; | 
// adjust the upper bounds down by 1 element so they point at the last
// iteration of the current thread rather than the first iteration of the next
// thread
|  | if (upper_bound_inner == 0) { | 
|  | // {n,0} => {n-1,n-1} | 
|  | upper_bound_outer -= 1; | 
|  | upper_bound_inner = upper_bound_outer; | 
|  | } else { | 
|  | // {n,m} => {n,m-1} (m!=0) | 
|  | upper_bound_inner -= 1; | 
|  | } | 
|  |  | 
// assign the values, zeroing out the lb1 and ub1 coefficients since the chunk
// bounds no longer depend on the outer induction variable; the upper triangle
// is chunked as a mirrored lower triangle, so the bounds computed above are
// flipped back through (outer_iters - 1) - x
|  | chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer; | 
|  | chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner; | 
|  | chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer; | 
|  | chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner; | 
|  | chunk_bounds_nest[0].lb1_u64 = 0; | 
|  | chunk_bounds_nest[0].ub1_u64 = 0; | 
|  | chunk_bounds_nest[1].lb1_u64 = 0; | 
|  | chunk_bounds_nest[1].ub1_u64 = 0; | 
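// Mirror check (continuing the illustrative N = 3, nth = 4 example): thread 0
// computes {lbo,lbi} = {0,0} and adjusted {ubo,ubi} = {1,1} in the mirrored
// lower-triangle space; flipping through (outer_iters - 1) - x yields the
// chunk {2,2}..{3,3}, i.e. iterations (2,2), (2,3), (3,3) of the upper
// triangle: 3 iterations, as expected for a remainder thread.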
|  |  | 
|  | #if 0 | 
|  | printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n", | 
|  | tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64, | 
|  | chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total); | 
|  | #endif | 
|  | } | 
|  | //----------Init API for non-rectangular loops-------------------------------- | 
|  |  | 
// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=; no !=, <, >). If the original loop nest was
// already in canonical form, the bounds in the bounds_nest array are left
// unchanged (only trip counts are calculated). Internally the API expands the
// space to a parallelogram/parallelepiped, calculates the total, calculates
// the bounds for the chunks in terms of the new IV, re-calculates them in
// terms of the old IVs (especially important on the left side, to hit the
// lower bounds and not step over), and picks the correct chunk for this
// thread (so it calculates chunks up to the needed one). It could be
// optimized to calculate just this chunk, at the cost of a potentially less
// even distribution among threads. It is designed so that threads receive
// predictable chunks deterministically (a subsequent nest of loops with
// similar characteristics will get exactly the same chunks on the same
// threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0;
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future.)
|  | extern "C" kmp_int32 | 
|  | __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, | 
|  | /*in/out*/ bounds_info_t *original_bounds_nest, | 
|  | /*out*/ bounds_info_t *chunk_bounds_nest, | 
|  | kmp_index_t n, /*out*/ kmp_int32 *plastiter) { | 
|  |  | 
|  | KMP_DEBUG_ASSERT(plastiter && original_bounds_nest); | 
|  | KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid)); | 
|  |  | 
|  | if (__kmp_env_consistency_check) { | 
|  | __kmp_push_workshare(gtid, ct_pdo, loc); | 
|  | } | 
|  |  | 
|  | kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n); | 
|  |  | 
|  | CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n); | 
|  |  | 
|  | for (kmp_index_t i = 0; i < n; ++i) { | 
|  | updated_bounds_nest[i].b = original_bounds_nest[i]; | 
|  | } | 
|  |  | 
|  | kmp_loop_nest_iv_t total = | 
|  | kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n); | 
|  |  | 
|  | if (plastiter != NULL) { | 
|  | *plastiter = FALSE; | 
|  | } | 
|  |  | 
|  | if (total == 0) { | 
|  | // Loop won't execute: | 
|  | return FALSE; | 
|  | } | 
|  |  | 
|  | // OMPTODO: DISTRIBUTE is not supported yet | 
|  | __kmp_assert_valid_gtid(gtid); | 
|  | kmp_uint32 tid = __kmp_tid_from_gtid(gtid); | 
|  |  | 
|  | kmp_info_t *th = __kmp_threads[gtid]; | 
|  | kmp_team_t *team = th->th.th_team; | 
|  | kmp_uint32 nth = team->t.t_nproc; // Number of threads | 
|  |  | 
|  | KMP_DEBUG_ASSERT(tid < nth); | 
|  |  | 
|  | // Handle special cases | 
|  | nested_loop_type_t loop_type = | 
|  | kmp_identify_nested_loop_structure(original_bounds_nest, n); | 
|  | if (loop_type == nested_loop_type_lower_triangular_matrix) { | 
|  | kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest, | 
|  | chunk_bounds_nest); | 
|  | return TRUE; | 
|  | } else if (loop_type == nested_loop_type_upper_triangular_matrix) { | 
|  | kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest, | 
|  | chunk_bounds_nest); | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | CollapseAllocator<kmp_uint64> original_ivs_start(n); | 
|  |  | 
|  | if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n, | 
|  | /*out*/ original_ivs_start)) { | 
|  | // Loop won't execute: | 
|  | return FALSE; | 
|  | } | 
|  |  | 
// Not doing this optimization for one thread:
// (1) more to test
// (2) without it, the current contract holds: chunk_bounds_nest has only lb0
// and ub0; lb1 and ub1 are set to 0 and can be ignored.
|  | // if (nth == 1) { | 
|  | //  // One thread: | 
|  | //  // Copy all info from original_bounds_nest, it'll be good enough. | 
|  |  | 
|  | //  for (kmp_index_t i = 0; i < n; ++i) { | 
|  | //    chunk_bounds_nest[i] = original_bounds_nest[i]; | 
|  | //  } | 
|  |  | 
|  | //  if (plastiter != NULL) { | 
|  | //    *plastiter = TRUE; | 
|  | //  } | 
|  | //  return TRUE; | 
|  | //} | 
|  |  | 
|  | kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs( | 
|  | updated_bounds_nest, original_ivs_start, n); | 
|  |  | 
|  | bool last_iter = false; | 
|  |  | 
while (nth > 0) {
// We could calculate the chunk size once, but this per-thread recalculation
// compensates for the original space not being a parallelepiped, which could
// otherwise leave some threads without work:
|  | KMP_DEBUG_ASSERT(total >= new_iv); | 
|  |  | 
|  | kmp_loop_nest_iv_t total_left = total - new_iv; | 
|  | kmp_loop_nest_iv_t chunk_size = total_left / nth; | 
|  | kmp_loop_nest_iv_t remainder = total_left % nth; | 
|  |  | 
|  | kmp_loop_nest_iv_t curr_chunk_size = chunk_size; | 
|  |  | 
|  | if (remainder > 0) { | 
|  | ++curr_chunk_size; | 
|  | --remainder; | 
|  | } | 
|  |  | 
|  | #if defined(KMP_DEBUG) | 
|  | kmp_loop_nest_iv_t new_iv_for_start = new_iv; | 
|  | #endif | 
|  |  | 
|  | if (curr_chunk_size > 1) { | 
|  | new_iv += curr_chunk_size - 1; | 
|  | } | 
|  |  | 
|  | CollapseAllocator<kmp_uint64> original_ivs_end(n); | 
|  | if ((nth == 1) || (new_iv >= total - 1)) { | 
|  | // Do this one till the end - just in case we miscalculated | 
|  | // and either too much is left to process or new_iv is a bit too big: | 
|  | kmp_calc_original_ivs_for_end(original_bounds_nest, n, | 
|  | /*out*/ original_ivs_end); | 
|  |  | 
|  | last_iter = true; | 
|  | } else { | 
|  | // Note: here we make sure it's past (or equal to) the previous point. | 
|  | if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n, | 
|  | updated_bounds_nest, | 
|  | original_ivs_start, new_iv, | 
|  | /*out*/ original_ivs_end)) { | 
|  | // We could not find the ending point, use the original upper bounds: | 
|  | kmp_calc_original_ivs_for_end(original_bounds_nest, n, | 
|  | /*out*/ original_ivs_end); | 
|  |  | 
|  | last_iter = true; | 
|  | } | 
|  | } | 
|  |  | 
|  | #if defined(KMP_DEBUG) | 
|  | auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs( | 
|  | updated_bounds_nest, original_ivs_end, n); | 
|  | KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start); | 
|  | #endif | 
|  |  | 
|  | if (last_iter && (tid != 0)) { | 
// We are done; this was the last chunk, but no chunk for the current
// thread was found:
|  | return FALSE; | 
|  | } | 
|  |  | 
|  | if (tid == 0) { | 
|  | // We found the chunk for this thread, now we need to check if it's the | 
|  | // last chunk or not: | 
|  |  | 
|  | CollapseAllocator<kmp_uint64> original_ivs_next_start(n); | 
|  | if (last_iter || | 
|  | !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end, | 
|  | /*out*/ original_ivs_next_start)) { | 
// no more loop iterations left to process,
// which means the currently found chunk is the last chunk:
|  | if (plastiter != NULL) { | 
|  | *plastiter = TRUE; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Fill in chunk bounds: | 
|  | for (kmp_index_t i = 0; i < n; ++i) { | 
|  | chunk_bounds_nest[i] = | 
|  | original_bounds_nest[i]; // To fill in types, etc. - optional | 
|  | chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i]; | 
|  | chunk_bounds_nest[i].lb1_u64 = 0; | 
|  |  | 
|  | chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i]; | 
|  | chunk_bounds_nest[i].ub1_u64 = 0; | 
|  | } | 
|  |  | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | --tid; | 
|  | --nth; | 
|  |  | 
|  | bool next_chunk = kmp_calc_next_original_ivs( | 
|  | original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start); | 
|  | if (!next_chunk) { | 
// no more loop iterations to process;
// the previous chunk was the last chunk
|  | break; | 
|  | } | 
|  |  | 
// original_ivs_start now points just past the previous chunk's
// original_ivs_end; the new chunk starts here, so chunks follow one another
// without any gap or overlap:
|  | new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest, | 
|  | original_ivs_start, n); | 
|  | } | 
|  |  | 
|  | return FALSE; | 
|  | } |