runtime/src/kmp_collapse.h - llvm-project/openmp - Git at Google

 /*
  * kmp_collapse.h -- header for loop collapse feature
  */

 //===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #ifndef KMP_COLLAPSE_H
 #define KMP_COLLAPSE_H

 #include <type_traits>

 // Type of the index into the loop nest structures
 // (values range from 0 up to, but not including, n from collapse(n)).
 typedef kmp_int32 kmp_index_t;

 // Type of the induction variable (IV) of the combined loop nest space.
 // The trip_count fields declared in this header use this type as well.
 typedef kmp_uint64 kmp_loop_nest_iv_t;

 // Comparison operator used by a loop (<, <=, etc.).
 // Stored as kmp_int32; every enumerator has an explicit value.
 enum comparison_t : kmp_int32 {
   comp_less_or_eq = 0, // <=
   comp_greater_or_eq = 1, // >=
   comp_not_eq = 2, // !=
   comp_less = 3, // <
   comp_greater = 4 // >
 };

 // Type of the loop IV.
 // The types of the bounds and of the step, after the usual promotions,
 // are a subset of these types (32 & 64 bit only).
 // Stored as kmp_int32; every enumerator has an explicit value.
 enum loop_type_t : kmp_int32 {
   loop_type_uint8 = 0,
   loop_type_int8 = 1,
   loop_type_uint16 = 2,
   loop_type_int16 = 3,
   loop_type_uint32 = 4,
   loop_type_int32 = 5,
   loop_type_uint64 = 6,
   loop_type_int64 = 7
 };

 // Kinds of nested loops, defined so that special cases can be handled.
 // NOTE(review): "unkown" in nested_loop_type_unkown is a misspelling of
 // "unknown"; the identifier is left untouched because code outside this
 // header may refer to it.
 enum nested_loop_type_t : kmp_int32 {
   nested_loop_type_unkown = 0,
   nested_loop_type_lower_triangular_matrix = 1,
   nested_loop_type_upper_triangular_matrix = 2
 };

 /*!
  @ingroup WORK_SHARING
  * Describes the structure for rectangular nested loops.
  * Typed view of a single loop: T is the type of the bounds and ST (taken
  * from traits_t<T>) is the signed type used for the step. Every bound and
  * the step share storage with a 64-bit member.
  */
 template <typename T> struct bounds_infoXX_template {

   // typedef typename traits_t<T>::unsigned_t UT;
   typedef typename traits_t<T>::signed_t ST;

   loop_type_t loop_type; // The differentiator
   loop_type_t loop_iv_type;
   comparison_t comparison;
   // outer_iv should be 0 (or any other value less than the number of
   // dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
   // This way we can do multiplication without a check.
   // NOTE(review): presumably the effective bounds are lb0 + lb1 * <outer IV>
   // and ub0 + ub1 * <outer IV> -- confirm against the implementation.
   kmp_index_t outer_iv;

   // unions to keep the size constant:
   union {
     T lb0;
     kmp_uint64 lb0_u64; // real type can be signed
   };

   union {
     T lb1;
     kmp_uint64 lb1_u64; // real type can be signed
   };

   union {
     T ub0;
     kmp_uint64 ub0_u64; // real type can be signed
   };

   union {
     T ub1;
     kmp_uint64 ub1_u64; // real type can be signed
   };

   union {
     ST step; // signed even if bounds type is unsigned
     kmp_int64 step_64; // signed
   };

   // Trip count of this loop, stored in the combined loop nest IV type.
   kmp_loop_nest_iv_t trip_count;
 };

 /*!
  @ingroup WORK_SHARING
  * Interface struct for rectangular nested loops.
  * Same size as bounds_infoXX_template.
  * Untyped view: bounds and step are carried only in their 64-bit form.
  */
 struct bounds_info_t {

   loop_type_t loop_type; // The differentiator
   loop_type_t loop_iv_type;
   comparison_t comparison;
   // outer_iv should be 0 (or any other value less than the number of
   // dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
   // This way we can do multiplication without a check.
   kmp_index_t outer_iv;

   kmp_uint64 lb0_u64; // real type can be signed
   kmp_uint64 lb1_u64; // real type can be signed
   kmp_uint64 ub0_u64; // real type can be signed
   kmp_uint64 ub1_u64; // real type can be signed
   kmp_int64 step_64; // signed

   // This is internal, but it's the only internal thing we need
   // in the rectangular case, so let's expose it here:
   kmp_loop_nest_iv_t trip_count;
 };

 //-------------------------------------------------------------------------
 // Additional types for internal representation:

 // Array for a point in the loop space, in the original space.
 // It's represented in kmp_uint64, but each dimension is calculated in
 // that loop's IV type. The dimensions also have to be converted to those
 // types when used in generated code.
 typedef kmp_uint64 *kmp_point_t;

 // Array: number of loop iterations on each nesting level needed to reach
 // some point, in the expanded space or in the original space.
 // OMPTODO: move from using iterations to using offsets (iterations multiplied
 // by steps). For those we need to be careful with the types, as step can be
 // negative, but it'll remove multiplications and divisions in several places.
 typedef kmp_loop_nest_iv_t *kmp_iterations_t;

 // Internal struct with additional info:
 template <typename T> struct bounds_info_internalXX_template {

   // OMPTODO: should span have type T or should it better be
   // kmp_uint64/kmp_int64 depending on T sign? (if kmp_uint64/kmp_int64 than
   // updated bounds should probably also be kmp_uint64/kmp_int64). I'd like to
   // use big_span_t, if it can be resolved at compile time.
   typedef
       typename std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
           big_span_t;

   // typedef typename big_span_t span_t;
   typedef T span_t;

   bounds_infoXX_template<T> b; // possibly adjusted bounds

   // Leaving this as a union in case we'll switch to span_t with different sizes
   // (depending on T)
   union {
     // Smallest possible value of iv (may be smaller than actually possible)
     span_t span_smallest;
     kmp_uint64 span_smallest_u64;
   };

   // Leaving this as a union in case we'll switch to span_t with different sizes
   // (depending on T)
   union {
     // Biggest possible value of iv (may be bigger than actually possible)
     span_t span_biggest;
     kmp_uint64 span_biggest_u64;
   };

   // Did we adjust loop bounds (not counting canonicalization)?
   bool loop_bounds_adjusted;
 };

 // Internal struct with additional info:
 // untyped counterpart of bounds_info_internalXX_template, with the same
 // members in the same order, carried in their 64-bit form.
 struct bounds_info_internal_t {

   bounds_info_t b; // possibly adjusted bounds

   // Smallest possible value of iv (may be smaller than actually possible)
   kmp_uint64 span_smallest_u64;

   // Biggest possible value of iv (may be bigger than actually possible)
   kmp_uint64 span_biggest_u64;

   // Did we adjust loop bounds (not counting canonicalization)?
   bool loop_bounds_adjusted;
 };

 //----------APIs for rectangular loop nests--------------------------------

 // Canonicalize loop nest and calculate overall trip count.
 // "bounds_nest" (the original_bounds_nest array) has to be allocated per
 // thread.
 // API will modify original bounds_nest array to bring it to a canonical form
 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
 // canonical form there will be no changes to bounds in bounds_nest array
 // (only trip counts will be calculated).
 // Returns trip count of overall space.
 // NOTE(review): loc and gtid are presumably the usual source location and
 // global thread id, and n the number of loops in the nest (the n of
 // collapse(n)) -- confirm against the implementation.
 extern "C" kmp_loop_nest_iv_t
 __kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                  /*in/out*/ bounds_info_t *original_bounds_nest,
                                  kmp_index_t n);

 // Calculate old induction variables corresponding to overall new_iv.
 // The results are written to the original_ivs array.
 // Note: each original IV will be returned as if it had kmp_uint64 type;
 // it will have to be converted to its original type in user code.
 // Note: trip counts should be already calculated by
 // __kmpc_process_loop_nest_rectang.
 // NOTE(review): original_ivs presumably needs room for n entries, one per
 // loop of the nest -- confirm against the implementation.
 // OMPTODO: special case 2, 3 nested loops - if it'll be possible to inline
 // that into user code.
 extern "C" void
 __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                  const bounds_info_t *original_bounds_nest,
                                  /*out*/ kmp_uint64 *original_ivs,
                                  kmp_index_t n);

 //----------Init API for non-rectangular loops--------------------------------

 // Init API for collapsed loops (static, no chunks defined).
 // "bounds_nest" (the original_bounds_nest array) has to be allocated per
 // thread.
 // API will modify original bounds_nest array to bring it to a canonical form
 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
 // canonical form there will be no changes to bounds in bounds_nest array
 // (only trip counts will be calculated). Internally API will expand the space
 // to parallelogram/parallelepiped, calculate total, calculate bounds for the
 // chunks in terms of the new IV, re-calc them in terms of old IVs (especially
 // important on the left side, to hit the lower bounds and not step over), and
 // pick the correct chunk for this thread (so it will calculate chunks up to the
 // needed one). It could be optimized to calculate just this chunk, potentially
 // a bit less well distributed among threads. It is designed to make sure that
 // threads will receive predictable chunks, deterministically (so that next nest
 // of loops with similar characteristics will get exactly same chunks on same
 // threads).
 // Current contract: chunk_bounds_nest has only lb0 and ub0,
 // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
 // NOTE(review): the meaning of the kmp_int32 return value and of *plastiter
 // (presumably a "last iteration" flag) is not described here -- confirm
 // against the implementation.
 extern "C" kmp_int32
 __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                           /*in/out*/ bounds_info_t *original_bounds_nest,
                           /*out*/ bounds_info_t *chunk_bounds_nest,
                           kmp_index_t n,
                           /*out*/ kmp_int32 *plastiter);

 #endif // KMP_COLLAPSE_H
	/*
	* kmp_collapse.h -- header for loop collapse feature
	*/

	//===----------------------------------------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#ifndef KMP_COLLAPSE_H
	#define KMP_COLLAPSE_H

	#include <type_traits>

	// Type of the index into the loop nest structures
	// (values range from 0 up to, but not including, n from collapse(n)).
	typedef kmp_int32 kmp_index_t;

	// Type of the induction variable (IV) of the combined loop nest space.
	// The trip_count fields declared in this header use this type as well.
	typedef kmp_uint64 kmp_loop_nest_iv_t;

	// Comparison operator used by a loop (<, <=, etc.).
	// Stored as kmp_int32; every enumerator has an explicit value.
	enum comparison_t : kmp_int32 {
	comp_less_or_eq = 0, // <=
	comp_greater_or_eq = 1, // >=
	comp_not_eq = 2, // !=
	comp_less = 3, // <
	comp_greater = 4 // >
	};

	// Type of the loop IV.
	// The types of the bounds and of the step, after the usual promotions,
	// are a subset of these types (32 & 64 bit only).
	// Stored as kmp_int32; every enumerator has an explicit value.
	enum loop_type_t : kmp_int32 {
	loop_type_uint8 = 0,
	loop_type_int8 = 1,
	loop_type_uint16 = 2,
	loop_type_int16 = 3,
	loop_type_uint32 = 4,
	loop_type_int32 = 5,
	loop_type_uint64 = 6,
	loop_type_int64 = 7
	};

	// Kinds of nested loops, defined so that special cases can be handled.
	// NOTE(review): "unkown" in nested_loop_type_unkown is a misspelling of
	// "unknown"; the identifier is left untouched because code outside this
	// header may refer to it.
	enum nested_loop_type_t : kmp_int32 {
	nested_loop_type_unkown = 0,
	nested_loop_type_lower_triangular_matrix = 1,
	nested_loop_type_upper_triangular_matrix = 2
	};

	/*!
	@ingroup WORK_SHARING
	* Describes the structure for rectangular nested loops.
	* Typed view of a single loop: T is the type of the bounds and ST (taken
	* from traits_t<T>) is the signed type used for the step. Every bound and
	* the step share storage with a 64-bit member.
	*/
	template <typename T> struct bounds_infoXX_template {

	// typedef typename traits_t<T>::unsigned_t UT;
	typedef typename traits_t<T>::signed_t ST;

	loop_type_t loop_type; // The differentiator
	loop_type_t loop_iv_type;
	comparison_t comparison;
	// outer_iv should be 0 (or any other value less than the number of
	// dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
	// This way we can do multiplication without a check.
	// NOTE(review): presumably the effective bounds are lb0 + lb1 * <outer IV>
	// and ub0 + ub1 * <outer IV> -- confirm against the implementation.
	kmp_index_t outer_iv;

	// unions to keep the size constant:
	union {
	T lb0;
	kmp_uint64 lb0_u64; // real type can be signed
	};

	union {
	T lb1;
	kmp_uint64 lb1_u64; // real type can be signed
	};

	union {
	T ub0;
	kmp_uint64 ub0_u64; // real type can be signed
	};

	union {
	T ub1;
	kmp_uint64 ub1_u64; // real type can be signed
	};

	union {
	ST step; // signed even if bounds type is unsigned
	kmp_int64 step_64; // signed
	};

	// Trip count of this loop, stored in the combined loop nest IV type.
	kmp_loop_nest_iv_t trip_count;
	};

	/*!
	@ingroup WORK_SHARING
	* Interface struct for rectangular nested loops.
	* Same size as bounds_infoXX_template.
	* Untyped view: bounds and step are carried only in their 64-bit form.
	*/
	struct bounds_info_t {

	loop_type_t loop_type; // The differentiator
	loop_type_t loop_iv_type;
	comparison_t comparison;
	// outer_iv should be 0 (or any other value less than the number of
	// dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
	// This way we can do multiplication without a check.
	kmp_index_t outer_iv;

	kmp_uint64 lb0_u64; // real type can be signed
	kmp_uint64 lb1_u64; // real type can be signed
	kmp_uint64 ub0_u64; // real type can be signed
	kmp_uint64 ub1_u64; // real type can be signed
	kmp_int64 step_64; // signed

	// This is internal, but it's the only internal thing we need
	// in the rectangular case, so let's expose it here:
	kmp_loop_nest_iv_t trip_count;
	};

	//-------------------------------------------------------------------------
	// Additional types for internal representation:

	// Array for a point in the loop space, in the original space.
	// It's represented in kmp_uint64, but each dimension is calculated in
	// that loop's IV type. The dimensions also have to be converted to those
	// types when used in generated code.
	typedef kmp_uint64 *kmp_point_t;

	// Array: number of loop iterations on each nesting level needed to reach
	// some point, in the expanded space or in the original space.
	// OMPTODO: move from using iterations to using offsets (iterations multiplied
	// by steps). For those we need to be careful with the types, as step can be
	// negative, but it'll remove multiplications and divisions in several places.
	typedef kmp_loop_nest_iv_t *kmp_iterations_t;

	// Internal struct with additional info:
	template <typename T> struct bounds_info_internalXX_template {

	// OMPTODO: should span have type T or should it better be
	// kmp_uint64/kmp_int64 depending on T sign? (if kmp_uint64/kmp_int64 than
	// updated bounds should probably also be kmp_uint64/kmp_int64). I'd like to
	// use big_span_t, if it can be resolved at compile time.
	typedef
	typename std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
	big_span_t;

	// typedef typename big_span_t span_t;
	typedef T span_t;

	bounds_infoXX_template<T> b; // possibly adjusted bounds

	// Leaving this as a union in case we'll switch to span_t with different sizes
	// (depending on T)
	union {
	// Smallest possible value of iv (may be smaller than actually possible)
	span_t span_smallest;
	kmp_uint64 span_smallest_u64;
	};

	// Leaving this as a union in case we'll switch to span_t with different sizes
	// (depending on T)
	union {
	// Biggest possible value of iv (may be bigger than actually possible)
	span_t span_biggest;
	kmp_uint64 span_biggest_u64;
	};

	// Did we adjust loop bounds (not counting canonicalization)?
	bool loop_bounds_adjusted;
	};

	// Internal struct with additional info:
	// untyped counterpart of bounds_info_internalXX_template, with the same
	// members in the same order, carried in their 64-bit form.
	struct bounds_info_internal_t {

	bounds_info_t b; // possibly adjusted bounds

	// Smallest possible value of iv (may be smaller than actually possible)
	kmp_uint64 span_smallest_u64;

	// Biggest possible value of iv (may be bigger than actually possible)
	kmp_uint64 span_biggest_u64;

	// Did we adjust loop bounds (not counting canonicalization)?
	bool loop_bounds_adjusted;
	};

	//----------APIs for rectangular loop nests--------------------------------

	// Canonicalize loop nest and calculate overall trip count.
	// "bounds_nest" (the original_bounds_nest array) has to be allocated per
	// thread.
	// API will modify original bounds_nest array to bring it to a canonical form
	// (only <= and >=, no !=, <, >). If the original loop nest was already in a
	// canonical form there will be no changes to bounds in bounds_nest array
	// (only trip counts will be calculated).
	// Returns trip count of overall space.
	// NOTE(review): loc and gtid are presumably the usual source location and
	// global thread id, and n the number of loops in the nest (the n of
	// collapse(n)) -- confirm against the implementation.
	extern "C" kmp_loop_nest_iv_t
	__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
	                                 /*in/out*/ bounds_info_t *original_bounds_nest,
	                                 kmp_index_t n);

	// Calculate old induction variables corresponding to overall new_iv.
	// The results are written to the original_ivs array.
	// Note: each original IV will be returned as if it had kmp_uint64 type;
	// it will have to be converted to its original type in user code.
	// Note: trip counts should be already calculated by
	// __kmpc_process_loop_nest_rectang.
	// NOTE(review): original_ivs presumably needs room for n entries, one per
	// loop of the nest -- confirm against the implementation.
	// OMPTODO: special case 2, 3 nested loops - if it'll be possible to inline
	// that into user code.
	extern "C" void
	__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
	                                 const bounds_info_t *original_bounds_nest,
	                                 /*out*/ kmp_uint64 *original_ivs,
	                                 kmp_index_t n);

	//----------Init API for non-rectangular loops--------------------------------

	// Init API for collapsed loops (static, no chunks defined).
	// "bounds_nest" (the original_bounds_nest array) has to be allocated per
	// thread.
	// API will modify original bounds_nest array to bring it to a canonical form
	// (only <= and >=, no !=, <, >). If the original loop nest was already in a
	// canonical form there will be no changes to bounds in bounds_nest array
	// (only trip counts will be calculated). Internally API will expand the space
	// to parallelogram/parallelepiped, calculate total, calculate bounds for the
	// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
	// important on the left side, to hit the lower bounds and not step over), and
	// pick the correct chunk for this thread (so it will calculate chunks up to the
	// needed one). It could be optimized to calculate just this chunk, potentially
	// a bit less well distributed among threads. It is designed to make sure that
	// threads will receive predictable chunks, deterministically (so that next nest
	// of loops with similar characteristics will get exactly same chunks on same
	// threads).
	// Current contract: chunk_bounds_nest has only lb0 and ub0,
	// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
	// NOTE(review): the meaning of the kmp_int32 return value and of *plastiter
	// (presumably a "last iteration" flag) is not described here -- confirm
	// against the implementation.
	extern "C" kmp_int32
	__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
	                          /*in/out*/ bounds_info_t *original_bounds_nest,
	                          /*out*/ bounds_info_t *chunk_bounds_nest,
	                          kmp_index_t n,
	                          /*out*/ kmp_int32 *plastiter);

	#endif // KMP_COLLAPSE_H