openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu - llvm-project - Git at Google

 //===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file contains the implementation of data sharing environments
 //
 //===----------------------------------------------------------------------===//
 #pragma omp declare target

 #include "common/omptarget.h"
 #include "target/shuffle.h"
 #include "target_impl.h"

 ////////////////////////////////////////////////////////////////////////////////
 // Runtime functions for trunk data sharing scheme.
 ////////////////////////////////////////////////////////////////////////////////

 static constexpr unsigned MinBytes = 8;

 static constexpr unsigned Alignment = 8;

 /// External symbol to access dynamic shared memory.
 extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
 #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

 EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; }

 EXTERN void *llvm_omp_get_dynamic_shared() {
   return __kmpc_get_dynamic_shared();
 }

 template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
 struct alignas(32) ThreadStackTy {
   static constexpr unsigned BytesPerThread = BPerThread;
   static constexpr unsigned NumThreads = NThreads;
   static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;

   unsigned char Data[NumThreads][BytesPerThread];
   unsigned char Usage[NumThreads];
 };

 [[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
 #pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)

 [[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
                                               MAX_THREADS_PER_TEAM / 4>
     WorkerSharedStack;
 #pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)

 EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
   size_t AlignedBytes = Bytes + (Bytes % MinBytes);
   int TID = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(TID)) {
     // Main thread alone, use shared memory if space is available.
     if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
       void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
       MainSharedStack.Usage[0] += AlignedBytes;
       return Ptr;
     }
   } else if (TID < WorkerSharedStack.NumThreads) {
     if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
       void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
       WorkerSharedStack.Usage[TID] += AlignedBytes;
       return Ptr;
     }
   }
   // Fallback to malloc
   return SafeMalloc(Bytes, "AllocGlobalFallback");
 }

 EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
   size_t AlignedBytes = Bytes + (Bytes % MinBytes);
   int TID = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(TID)) {
     if (Ptr >= &MainSharedStack.Data[0][0] &&
         Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
       MainSharedStack.Usage[0] -= AlignedBytes;
       return;
     }
   } else if (TID < WorkerSharedStack.NumThreads) {
     if (Ptr >= &WorkerSharedStack.Data[0][0] &&
         Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
       int TID = __kmpc_get_hardware_thread_id_in_block();
       WorkerSharedStack.Usage[TID] -= AlignedBytes;
       return;
     }
   }
   SafeFree(Ptr, "FreeGlobalFallback");
 }

 EXTERN void __kmpc_data_sharing_init_stack() {
   for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
     MainSharedStack.Usage[i] = 0;
   for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
     WorkerSharedStack.Usage[i] = 0;
 }

 /// Allocate storage in shared memory to communicate arguments from the main
 /// thread to the workers in generic mode. If we exceed
 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
 #define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64

 [[clang::loader_uninitialized]] static void
     *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
 #pragma omp allocate(SharedMemVariableSharingSpace)                            \
     allocator(omp_pteam_mem_alloc)
 [[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
 #pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
     allocator(omp_pteam_mem_alloc)

 // Begin a data sharing context. Maintain a list of references to shared
 // variables. This list of references to shared variables will be passed
 // to one or more threads.
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
   if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
     SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
   } else {
     SharedMemVariableSharingSpacePtr =
         (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
   }
   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }

 // End a data sharing context. There is no need to have a list of refs
 // to shared variables because the context in which those variables were
 // shared has now ended. This should clean-up the list of references only
 // without affecting the actual global storage of the variables.
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_end_sharing_variables() {
   if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
     SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
 }

 // This function will return a list of references to global variables. This
 // is how the workers will get a reference to the globalized variable. The
 // members of this list will be passed to the outlined parallel function
 // preserving the order.
 // Called by all workers.
 EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }

 // This function is used to init static memory manager. This manager is used to
 // manage statically allocated global memory. This memory is allocated by the
 // compiler and used to correctly implement globalization of the variables in
 // target, teams and distribute regions.
 EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                           const void *buf, size_t size,
                                           int16_t is_shared,
                                           const void **frame) {
   if (is_shared) {
     *frame = buf;
     return;
   }
   if (isSPMDExecutionMode) {
     if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
     }
     __kmpc_impl_syncthreads();
     return;
   }
   ASSERT0(LT_FUSSY,
           __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
   __kmpc_impl_threadfence();
 }

 EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                               int16_t is_shared) {
   if (is_shared)
     return;
   if (isSPMDExecutionMode) {
     __kmpc_impl_syncthreads();
     if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       omptarget_nvptx_simpleMemoryManager.Release();
     }
     return;
   }
   __kmpc_impl_threadfence();
   ASSERT0(LT_FUSSY,
           __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   omptarget_nvptx_simpleMemoryManager.Release();
 }

 #pragma omp end declare target
	//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the implementation of data sharing environments
	//
	//===----------------------------------------------------------------------===//
	#pragma omp declare target

	#include "common/omptarget.h"
	#include "target/shuffle.h"
	#include "target_impl.h"

	////////////////////////////////////////////////////////////////////////////////
	// Runtime functions for trunk data sharing scheme.
	////////////////////////////////////////////////////////////////////////////////

	static constexpr unsigned MinBytes = 8;

	static constexpr unsigned Alignment = 8;

	/// External symbol to access dynamic shared memory.
	extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
	#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

	EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; }

	EXTERN void *llvm_omp_get_dynamic_shared() {
	return __kmpc_get_dynamic_shared();
	}

	template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
	struct alignas(32) ThreadStackTy {
	static constexpr unsigned BytesPerThread = BPerThread;
	static constexpr unsigned NumThreads = NThreads;
	static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;

	unsigned char Data[NumThreads][BytesPerThread];
	unsigned char Usage[NumThreads];
	};

	[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
	#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)

	[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
	MAX_THREADS_PER_TEAM / 4>
	WorkerSharedStack;
	#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)

	EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
	size_t AlignedBytes = Bytes + (Bytes % MinBytes);
	int TID = __kmpc_get_hardware_thread_id_in_block();
	if (__kmpc_is_generic_main_thread(TID)) {
	// Main thread alone, use shared memory if space is available.
	if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
	void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
	MainSharedStack.Usage[0] += AlignedBytes;
	return Ptr;
	}
	} else if (TID < WorkerSharedStack.NumThreads) {
	if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
	void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
	WorkerSharedStack.Usage[TID] += AlignedBytes;
	return Ptr;
	}
	}
	// Fallback to malloc
	return SafeMalloc(Bytes, "AllocGlobalFallback");
	}

	EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
	size_t AlignedBytes = Bytes + (Bytes % MinBytes);
	int TID = __kmpc_get_hardware_thread_id_in_block();
	if (__kmpc_is_generic_main_thread(TID)) {
	if (Ptr >= &MainSharedStack.Data[0][0] &&
	Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
	MainSharedStack.Usage[0] -= AlignedBytes;
	return;
	}
	} else if (TID < WorkerSharedStack.NumThreads) {
	if (Ptr >= &WorkerSharedStack.Data[0][0] &&
	Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
	int TID = __kmpc_get_hardware_thread_id_in_block();
	WorkerSharedStack.Usage[TID] -= AlignedBytes;
	return;
	}
	}
	SafeFree(Ptr, "FreeGlobalFallback");
	}

	EXTERN void __kmpc_data_sharing_init_stack() {
	for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
	MainSharedStack.Usage[i] = 0;
	for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
	WorkerSharedStack.Usage[i] = 0;
	}

	/// Allocate storage in shared memory to communicate arguments from the main
	/// thread to the workers in generic mode. If we exceed
	/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
	#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64

	[[clang::loader_uninitialized]] static void
	*SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
	#pragma omp allocate(SharedMemVariableSharingSpace) \
	allocator(omp_pteam_mem_alloc)
	[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
	#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
	allocator(omp_pteam_mem_alloc)

	// Begin a data sharing context. Maintain a list of references to shared
	// variables. This list of references to shared variables will be passed
	// to one or more threads.
	// In L0 data sharing this is called by master thread.
	// In L1 data sharing this is called by active warp master thread.
	EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
	if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
	SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
	} else {
	SharedMemVariableSharingSpacePtr =
	(void *)SafeMalloc(nArgs sizeof(void *), "new extended args");
	}
	*GlobalArgs = SharedMemVariableSharingSpacePtr;
	}

	// End a data sharing context. There is no need to have a list of refs
	// to shared variables because the context in which those variables were
	// shared has now ended. This should clean-up the list of references only
	// without affecting the actual global storage of the variables.
	// In L0 data sharing this is called by master thread.
	// In L1 data sharing this is called by active warp master thread.
	EXTERN void __kmpc_end_sharing_variables() {
	if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
	SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
	}

	// This function will return a list of references to global variables. This
	// is how the workers will get a reference to the globalized variable. The
	// members of this list will be passed to the outlined parallel function
	// preserving the order.
	// Called by all workers.
	EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
	*GlobalArgs = SharedMemVariableSharingSpacePtr;
	}

	// This function is used to init static memory manager. This manager is used to
	// manage statically allocated global memory. This memory is allocated by the
	// compiler and used to correctly implement globalization of the variables in
	// target, teams and distribute regions.
	EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
	const void *buf, size_t size,
	int16_t is_shared,
	const void **frame) {
	if (is_shared) {
	*frame = buf;
	return;
	}
	if (isSPMDExecutionMode) {
	if (__kmpc_get_hardware_thread_id_in_block() == 0) {
	*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
	}
	__kmpc_impl_syncthreads();
	return;
	}
	ASSERT0(LT_FUSSY,
	__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
	"Must be called only in the target master thread.");
	*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
	__kmpc_impl_threadfence();
	}

	EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
	int16_t is_shared) {
	if (is_shared)
	return;
	if (isSPMDExecutionMode) {
	__kmpc_impl_syncthreads();
	if (__kmpc_get_hardware_thread_id_in_block() == 0) {
	omptarget_nvptx_simpleMemoryManager.Release();
	}
	return;
	}
	__kmpc_impl_threadfence();
	ASSERT0(LT_FUSSY,
	__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
	"Must be called only in the target master thread.");
	omptarget_nvptx_simpleMemoryManager.Release();
	}

	#pragma omp end declare target