/*
* kmp_barrier.h
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef KMP_BARRIER_H
#define KMP_BARRIER_H

#include "kmp.h"
#include "kmp_i18n.h"

#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
#include <xmmintrin.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size)
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
  // posix_memalign() may leave ptr unmodified on failure, so initialize it
  // before it is tested below.
  void *ptr = nullptr;
  int n = posix_memalign(&ptr, alignment, size);
  if (n != 0) {
    if (ptr)
      free(ptr);
    return nullptr;
  }
  return ptr;
}
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE__ALIGNED_MALLOC
#include <malloc.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
#endif
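
// Whichever branch above is selected, memory obtained from
// KMP_ALIGNED_ALLOCATE must be released with the matching KMP_ALIGNED_FREE:
// e.g., _mm_malloc/_aligned_malloc memory must not be passed to plain free().
// Illustrative sketch (names are local to this example):
//   void *buf = KMP_ALIGNED_ALLOCATE(bytes, 4 * CACHE_LINE);
//   if (buf)
//     KMP_ALIGNED_FREE(buf);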
// Use four cache lines: MLC tends to prefetch the next or previous cache line
// creating a possible fake conflict between cores, so this is the only way to
// guarantee that no such prefetch can happen.
#ifndef KMP_FOURLINE_ALIGN_CACHE
#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
#endif
#define KMP_OPTIMIZE_FOR_REDUCTIONS 0
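
// A rough sketch of the design (the algorithm itself lives in the barrier
// implementation, kmp_barrier.cpp): threads in a team are partitioned into
// groups, each group is released through its own go signal, and per-thread
// arrival state (stillNeed/iter/sleep) is kept in the aligned arrays below so
// the structure can be sized and resized with the team.
// KMP_OPTIMIZE_FOR_REDUCTIONS above is a compile-time tuning switch consumed
// by that implementation; it defaults to off here.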
class distributedBarrier {
  struct flags_s {
    kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
  };

  struct go_s {
    std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
  };

  struct iter_s {
    kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
  };

  struct sleep_s {
    std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
  };
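
  // Each wrapper struct above pads/aligns a single flag to four cache lines
  // (KMP_FOURLINE_ALIGN_CACHE), so that flags belonging to different threads
  // can never sit on prefetch-adjacent lines and falsely conflict (see the
  // comment above KMP_FOURLINE_ALIGN_CACHE).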
  void init(size_t nthr);
  void resize(size_t nthr);
  void computeGo(size_t n);
  void computeVarsForN(size_t n);

public:
  enum {
    MAX_ITERS = 3,
    MAX_GOS = 8,
    IDEAL_GOS = 4,
    IDEAL_CONTENTION = 16,
  };
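  // Likely interpretation of these tuning constants (the authoritative
  // consumers are computeGo()/computeVarsForN() in kmp_barrier.cpp): MAX_ITERS
  // bounds the flags[] arrays below, MAX_GOS caps the go signals used per
  // group, and IDEAL_GOS/IDEAL_CONTENTION guide the choice of num_gos and
  // group sizes for a given thread count.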
  flags_s *flags[MAX_ITERS];
  go_s *go;
  iter_s *iter;
  sleep_s *sleep;
  size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
  size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
  // number of go signals each requiring one write per iteration
  size_t KMP_ALIGN_CACHE num_gos;
  // number of groups of gos
  size_t KMP_ALIGN_CACHE num_groups;
  // threads per go signal
  size_t KMP_ALIGN_CACHE threads_per_go;
  bool KMP_ALIGN_CACHE fix_threads_per_go;
  // threads per group
  size_t KMP_ALIGN_CACHE threads_per_group;
  // number of go signals in a group
  size_t KMP_ALIGN_CACHE gos_per_group;
  // storage for a copy of the team's ICVs (internal control variables)
  void *team_icvs;
  distributedBarrier() = delete;
  ~distributedBarrier() = delete;

  // Used instead of constructor to create aligned data
  static distributedBarrier *allocate(int nThreads) {
    distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
        sizeof(distributedBarrier), 4 * CACHE_LINE);
    if (!d) {
      KMP_FATAL(MemoryAllocFailed);
    }
    d->num_threads = 0;
    d->max_threads = 0;
    for (int i = 0; i < MAX_ITERS; ++i)
      d->flags[i] = NULL;
    d->go = NULL;
    d->iter = NULL;
    d->sleep = NULL;
    d->team_icvs = NULL;
    d->fix_threads_per_go = false;
    // calculate gos and groups ONCE on base size
    d->computeGo(nThreads);
    d->init(nThreads);
    return d;
  }
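
  // Typical lifecycle (illustrative sketch; nproc is a stand-in for the team
  // size): create with
  //   distributedBarrier *b = distributedBarrier::allocate(nproc);
  // grow via need_resize()/update_num_threads() as the team changes, and
  // release with
  //   distributedBarrier::deallocate(b);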
  static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }

  void update_num_threads(size_t nthr) { init(nthr); }

  bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
  size_t get_num_threads() { return num_threads; }
  kmp_uint64 go_release();
  void go_reset();
};
#endif // KMP_BARRIER_H