openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h - llvm-project - Git at Google

 //===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // Definitions of target specific functions
 //
 //===----------------------------------------------------------------------===//
 #ifndef _TARGET_IMPL_H_
 #define _TARGET_IMPL_H_

 #include "nvptx_interface.h"

 #include <stddef.h>
 #include <stdint.h>

 // subset of inttypes.h
 // Local stand-ins for the <inttypes.h> printf conversion macros for 64-bit
 // integers. Both expand to the `l` (long) length modifier, so they match
 // int64_t/uint64_t only where those types are `long`.
 // NOTE(review): confirm that holds for every target this header is built for.
 #define PRId64 "ld"
 #define PRIu64 "lu"

 // Unsigned 32-bit integer type used for lane masks (see
 // __kmpc_impl_all_lanes, which sets every bit of this type).
 typedef uint32_t __kmpc_impl_lanemask_t;

 // Attribute shorthands, spelled with the GCC/Clang __attribute__ syntax.
 // INLINE: `inline` combined with always_inline.
 #define INLINE inline __attribute__((always_inline))
 // NOINLINE: noinline attribute.
 #define NOINLINE __attribute__((noinline))
 // ALIGN(N): aligned(N) attribute.
 #define ALIGN(N) __attribute__((aligned(N)))
 // PLUGIN_ACCESSIBLE: expands to nothing for this target.
 #define PLUGIN_ACCESSIBLE /* no annotation needed for cuda plugin */

 #include "llvm/Frontend/OpenMP/OMPGridValues.h"

 // Accessor for the grid-value table this runtime is built against. Yields a
 // constexpr reference to llvm::omp::NVPTXGridValues, which lets the constants
 // in this header read its fields inside constant expressions.
 INLINE constexpr auto getGridValue() -> const llvm::omp::GV & {
   return llvm::omp::NVPTXGridValues;
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
 ////////////////////////////////////////////////////////////////////////////////

 ////////////////////////////////////////////////////////////////////////////////
 // The following def must match the absolute limit hardwired in the host RTL
 // max number of threads per team
 // Value: the GV_Max_WG_Size field of the table returned by getGridValue().
 enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
 // Warp size, read from the GV_Warp_Size field of the same table.
 enum { WARPSIZE = getGridValue().GV_Warp_Size };

 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
 // 32 when compiling for compute capability 6.0 or newer, 16 otherwise. The
 // fallback is also taken when __CUDA_ARCH__ is undefined, because an
 // undefined identifier evaluates to 0 in #if.
 #if __CUDA_ARCH__ >= 600
 #define OMP_STATE_COUNT 32
 #else
 #define OMP_STATE_COUNT 16
 #endif

 // Default for MAX_SM, chosen from the compute capability being compiled for.
 // Skipped entirely when MAX_SM is already defined (the #error text below
 // names the LIBOMPTARGET_NVPTX_MAX_SM cmake option as the way to set it).
 // NOTE(review): presumably an upper bound on the SM count used to size
 // per-SM state; confirm at the use sites.
 #if !defined(MAX_SM)
 #if __CUDA_ARCH__ >= 900
 // No built-in default for compute capability 9.0 and newer: fail the build
 // rather than guess.
 #error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
 #elif __CUDA_ARCH__ >= 800
 // GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
 // GA102 design has a maximum of 84 SMs
 #define MAX_SM 108
 #elif __CUDA_ARCH__ >= 700
 #define MAX_SM 84
 #elif __CUDA_ARCH__ >= 600
 #define MAX_SM 56
 #else
 // Also taken when __CUDA_ARCH__ is undefined, since an undefined identifier
 // evaluates to 0 in #if.
 #define MAX_SM 16
 #endif
 #endif

 // Constant 128 related to active parallel levels.
 // NOTE(review): whether this is a flag bit or a limit is decided by its users
 // outside this header; confirm there before changing the value.
 #define OMP_ACTIVE_PARALLEL_LEVEL 128

 // Data sharing related quantities, need to match what is used in the compiler.
 // Every enumerator is computed at compile time from the table returned by
 // getGridValue(), so the values follow that table rather than being
 // hard-coded here.
 enum DATA_SHARING_SIZES {
   // The size reserved for data in a shared memory slot.
   DS_Slot_Size = getGridValue().GV_Slot_Size,
   // The slot size that should be reserved for a working warp.
   DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
   // The maximum number of warps in use
   DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
 };

 // Lane mask with every bit set: the bitwise complement of zero in the mask
 // type. The enum's underlying type is fixed to __kmpc_impl_lanemask_t, so the
 // constant has exactly the mask's width.
 enum : __kmpc_impl_lanemask_t {
   __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
 };

 // Turns every later printf(...) invocation into nothing. The preprocessor
 // removes the call together with its arguments, so argument expressions are
 // never evaluated; code seeing this macro must not rely on side effects
 // inside printf arguments.
 #define printf(...)

 #endif
	//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// Definitions of target specific functions
	//
	//===----------------------------------------------------------------------===//
	#ifndef _TARGET_IMPL_H_
	#define _TARGET_IMPL_H_

	#include "nvptx_interface.h"

	#include <stddef.h>
	#include <stdint.h>

	// subset of inttypes.h
	// Local stand-ins for the <inttypes.h> printf conversion macros for 64-bit
	// integers. Both expand to the `l` (long) length modifier, so they match
	// int64_t/uint64_t only where those types are `long`.
	// NOTE(review): confirm that holds for every target this header is built for.
	#define PRId64 "ld"
	#define PRIu64 "lu"

	// Unsigned 32-bit integer type used for lane masks (see
	// __kmpc_impl_all_lanes, which sets every bit of this type).
	typedef uint32_t __kmpc_impl_lanemask_t;

	// Attribute shorthands, spelled with the GCC/Clang __attribute__ syntax.
	// INLINE: `inline` combined with always_inline.
	#define INLINE inline __attribute__((always_inline))
	// NOINLINE: noinline attribute.
	#define NOINLINE __attribute__((noinline))
	// ALIGN(N): aligned(N) attribute.
	#define ALIGN(N) __attribute__((aligned(N)))
	// PLUGIN_ACCESSIBLE: expands to nothing for this target.
	#define PLUGIN_ACCESSIBLE /* no annotation needed for cuda plugin */

	#include "llvm/Frontend/OpenMP/OMPGridValues.h"

	// Accessor for the grid-value table this runtime is built against. Yields a
	// constexpr reference to llvm::omp::NVPTXGridValues, which lets the constants
	// in this header read its fields inside constant expressions.
	INLINE constexpr auto getGridValue() -> const llvm::omp::GV & {
	return llvm::omp::NVPTXGridValues;
	}

	////////////////////////////////////////////////////////////////////////////////
	// Kernel options
	////////////////////////////////////////////////////////////////////////////////

	////////////////////////////////////////////////////////////////////////////////
	// The following def must match the absolute limit hardwired in the host RTL
	// max number of threads per team
	// Value: the GV_Max_WG_Size field of the table returned by getGridValue().
	enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
	// Warp size, read from the GV_Warp_Size field of the same table.
	enum { WARPSIZE = getGridValue().GV_Warp_Size };

	// Maximum number of omp state objects per SM allocated statically in global
	// memory.
	// 32 when compiling for compute capability 6.0 or newer, 16 otherwise. The
	// fallback is also taken when __CUDA_ARCH__ is undefined, because an
	// undefined identifier evaluates to 0 in #if.
	#if __CUDA_ARCH__ >= 600
	#define OMP_STATE_COUNT 32
	#else
	#define OMP_STATE_COUNT 16
	#endif

	// Default for MAX_SM, chosen from the compute capability being compiled for.
	// Skipped entirely when MAX_SM is already defined (the #error text below
	// names the LIBOMPTARGET_NVPTX_MAX_SM cmake option as the way to set it).
	// NOTE(review): presumably an upper bound on the SM count used to size
	// per-SM state; confirm at the use sites.
	#if !defined(MAX_SM)
	#if __CUDA_ARCH__ >= 900
	// No built-in default for compute capability 9.0 and newer: fail the build
	// rather than guess.
	#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
	#elif __CUDA_ARCH__ >= 800
	// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
	// GA102 design has a maximum of 84 SMs
	#define MAX_SM 108
	#elif __CUDA_ARCH__ >= 700
	#define MAX_SM 84
	#elif __CUDA_ARCH__ >= 600
	#define MAX_SM 56
	#else
	// Also taken when __CUDA_ARCH__ is undefined, since an undefined identifier
	// evaluates to 0 in #if.
	#define MAX_SM 16
	#endif
	#endif

	// Constant 128 related to active parallel levels.
	// NOTE(review): whether this is a flag bit or a limit is decided by its users
	// outside this header; confirm there before changing the value.
	#define OMP_ACTIVE_PARALLEL_LEVEL 128

	// Data sharing related quantities, need to match what is used in the compiler.
	// Every enumerator is computed at compile time from the table returned by
	// getGridValue(), so the values follow that table rather than being
	// hard-coded here.
	enum DATA_SHARING_SIZES {
	// The size reserved for data in a shared memory slot.
	DS_Slot_Size = getGridValue().GV_Slot_Size,
	// The slot size that should be reserved for a working warp.
	DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
	// The maximum number of warps in use
	DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
	};

	// Lane mask with every bit set: the bitwise complement of zero in the mask
	// type. The enum's underlying type is fixed to __kmpc_impl_lanemask_t, so the
	// constant has exactly the mask's width.
	enum : __kmpc_impl_lanemask_t {
	__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
	};

	// Turns every later printf(...) invocation into nothing. The preprocessor
	// removes the call together with its arguments, so argument expressions are
	// never evaluated; code seeing this macro must not rely on side effects
	// inside printf arguments.
	#define printf(...)

	#endif