// libomptarget/deviceRTLs/nvptx/src/target_impl.h - llvm-project/openmp - Git at Google

 //===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // Definitions of target specific functions
 //
 //===----------------------------------------------------------------------===//
 #ifndef _TARGET_IMPL_H_
 #define _TARGET_IMPL_H_

 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>

 #include "nvptx_interface.h"

 // Shorthands for GCC/Clang-style __attribute__ annotations.
 // INLINE: the `inline` keyword combined with the always_inline attribute.
 #define INLINE inline __attribute__((always_inline))
 // NOINLINE: the noinline attribute.
 #define NOINLINE __attribute__((noinline))
 // ALIGN(N): the aligned(N) attribute, where N is the requested alignment.
 #define ALIGN(N) __attribute__((aligned(N)))

 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
 ////////////////////////////////////////////////////////////////////////////////

 ////////////////////////////////////////////////////////////////////////////////
 // Maximum number of threads per team. This definition must match the
 // absolute limit hardwired in the host RTL.
 #define MAX_THREADS_PER_TEAM 1024

 // Warp width assumed by this header; DS_Worker_Warp_Slot_Size in
 // DATA_SHARING_SIZES is derived from it.
 // NOTE(review): presumably the number of lanes (threads) per warp -- confirm.
 #define WARPSIZE 32

 // Maximum number of preallocated arguments to an outlined parallel/simd
 // function. Anything more requires dynamic memory allocation.
 #define MAX_SHARED_ARGS 20

 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
 // The value depends on the target architecture: 32 when __CUDA_ARCH__ >= 600,
 // otherwise 16. If __CUDA_ARCH__ is not defined, the preprocessor evaluates
 // it as 0 in the #if, so the #else branch (16) is taken.
 #if __CUDA_ARCH__ >= 600
 #define OMP_STATE_COUNT 32
 #else
 #define OMP_STATE_COUNT 16
 #endif

 // Default for MAX_SM, used only when the build has not already defined it
 // (the #error text below names the LIBOMPTARGET_NVPTX_MAX_SM cmake option as
 // the way to do so). The default is picked from __CUDA_ARCH__; an undefined
 // __CUDA_ARCH__ evaluates as 0 in #if and ends up in the final #else (16).
 // NOTE(review): presumably the upper bound on the number of SMs the runtime
 // provisions for -- confirm against the users of MAX_SM.
 #if !defined(MAX_SM)
 #if __CUDA_ARCH__ >= 900
 // No default is provided for this architecture: fail the build and ask for an
 // explicit MAX_SM.
 #error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
 #elif __CUDA_ARCH__ >= 800
 // GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
 // GA102 design has a maximum of 84 SMs
 #define MAX_SM 108
 #elif __CUDA_ARCH__ >= 700
 #define MAX_SM 84
 #elif __CUDA_ARCH__ >= 600
 #define MAX_SM 56
 #else
 #define MAX_SM 16
 #endif // __CUDA_ARCH__ selection
 #endif // !defined(MAX_SM)

 // NOTE(review): the meaning of this constant is not evident from this header.
 // Presumably it is related to tracking whether a parallel level is active --
 // confirm against its users before changing the value.
 #define OMP_ACTIVE_PARALLEL_LEVEL 128

 // Data sharing related quantities. These values need to match what is used in
 // the compiler, so any change here requires the same change on the compiler
 // side.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
   DS_Max_Worker_Threads = 992,
   // The size reserved for data in a shared memory slot.
   DS_Slot_Size = 256,
   // The slot size that should be reserved for a working warp: WARPSIZE times
   // DS_Slot_Size (32 * 256 = 8192 with the WARPSIZE defined in this header).
   DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
   // The maximum number of warps in use.
   DS_Max_Warp_Number = 32,
   // The size of the preallocated shared memory buffer per team.
   DS_Shared_Memory_Size = 128,
 };

 // Lane mask constant with all bits set: the bitwise complement of zero cast
 // to __kmpc_impl_lanemask_t. The unnamed enum fixes its underlying type to
 // __kmpc_impl_lanemask_t (C++11 syntax), so the enumerator has that type.
 // __kmpc_impl_lanemask_t is not declared in this file; presumably it comes
 // from nvptx_interface.h.
 enum : __kmpc_impl_lanemask_t {
   __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
 };

 #endif
	//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// Definitions of target specific functions
	//
	//===----------------------------------------------------------------------===//
	// NOTE(review): this is a second copy of the header text. _TARGET_IMPL_H_ is
	// already defined by the first copy earlier in this file, so the condition
	// below is false and everything up to the matching #endif is skipped by the
	// preprocessor.
	#ifndef _TARGET_IMPL_H_
	#define _TARGET_IMPL_H_

	#include <inttypes.h>
	#include <stdio.h>
	#include <stdlib.h>

	#include "nvptx_interface.h"

	// Shorthands for GCC/Clang-style __attribute__ annotations.
	// INLINE: the `inline` keyword combined with the always_inline attribute.
	#define INLINE inline __attribute__((always_inline))
	// NOINLINE: the noinline attribute.
	#define NOINLINE __attribute__((noinline))
	// ALIGN(N): the aligned(N) attribute, where N is the requested alignment.
	#define ALIGN(N) __attribute__((aligned(N)))

	////////////////////////////////////////////////////////////////////////////////
	// Kernel options
	////////////////////////////////////////////////////////////////////////////////

	////////////////////////////////////////////////////////////////////////////////
	// Maximum number of threads per team. This definition must match the
	// absolute limit hardwired in the host RTL.
	#define MAX_THREADS_PER_TEAM 1024

	// Warp width assumed by this header; DS_Worker_Warp_Slot_Size in
	// DATA_SHARING_SIZES is derived from it.
	// NOTE(review): presumably the number of lanes (threads) per warp -- confirm.
	#define WARPSIZE 32

	// Maximum number of preallocated arguments to an outlined parallel/simd
	// function. Anything more requires dynamic memory allocation.
	#define MAX_SHARED_ARGS 20

	// Maximum number of omp state objects per SM allocated statically in global
	// memory.
	// The value depends on the target architecture: 32 when __CUDA_ARCH__ >= 600,
	// otherwise 16. If __CUDA_ARCH__ is not defined, the preprocessor evaluates
	// it as 0 in the #if, so the #else branch (16) is taken.
	#if __CUDA_ARCH__ >= 600
	#define OMP_STATE_COUNT 32
	#else
	#define OMP_STATE_COUNT 16
	#endif

	// Default for MAX_SM, used only when the build has not already defined it
	// (the #error text below names the LIBOMPTARGET_NVPTX_MAX_SM cmake option as
	// the way to do so). The default is picked from __CUDA_ARCH__; an undefined
	// __CUDA_ARCH__ evaluates as 0 in #if and ends up in the final #else (16).
	// NOTE(review): presumably the upper bound on the number of SMs the runtime
	// provisions for -- confirm against the users of MAX_SM.
	#if !defined(MAX_SM)
	#if __CUDA_ARCH__ >= 900
	// No default is provided for this architecture: fail the build and ask for an
	// explicit MAX_SM.
	#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
	#elif __CUDA_ARCH__ >= 800
	// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
	// GA102 design has a maximum of 84 SMs
	#define MAX_SM 108
	#elif __CUDA_ARCH__ >= 700
	#define MAX_SM 84
	#elif __CUDA_ARCH__ >= 600
	#define MAX_SM 56
	#else
	#define MAX_SM 16
	#endif // __CUDA_ARCH__ selection
	#endif // !defined(MAX_SM)

	// NOTE(review): the meaning of this constant is not evident from this header.
	// Presumably it is related to tracking whether a parallel level is active --
	// confirm against its users before changing the value.
	#define OMP_ACTIVE_PARALLEL_LEVEL 128

	// Data sharing related quantities. These values need to match what is used in
	// the compiler, so any change here requires the same change on the compiler
	// side.
	enum DATA_SHARING_SIZES {
	// The maximum number of workers in a kernel.
	DS_Max_Worker_Threads = 992,
	// The size reserved for data in a shared memory slot.
	DS_Slot_Size = 256,
	// The slot size that should be reserved for a working warp: WARPSIZE times
	// DS_Slot_Size (32 * 256 = 8192 with the WARPSIZE defined in this header).
	DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
	// The maximum number of warps in use.
	DS_Max_Warp_Number = 32,
	// The size of the preallocated shared memory buffer per team.
	DS_Shared_Memory_Size = 128,
	};

	// Lane mask constant with all bits set: the bitwise complement of zero cast
	// to __kmpc_impl_lanemask_t. The unnamed enum fixes its underlying type to
	// __kmpc_impl_lanemask_t (C++11 syntax), so the enumerator has that type.
	// __kmpc_impl_lanemask_t is not declared in this file; presumably it comes
	// from nvptx_interface.h.
	enum : __kmpc_impl_lanemask_t {
	__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
	};

	#endif