| //===-- CUDAIntrinsicCall.cpp ---------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Helper routines for constructing the FIR dialect of MLIR for CUDA |
| // intrinsics. This module makes extensive use of MLIR interfaces and |
| // follows MLIR's coding style |
| // (https://mlir.llvm.org/getting_started/DeveloperGuide/). |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" |
| #include "flang/Evaluate/common.h" |
| #include "flang/Optimizer/Builder/FIRBuilder.h" |
| #include "flang/Optimizer/Builder/MutableBox.h" |
| #include "mlir/Dialect/Index/IR/IndexOps.h" |
| #include "mlir/Dialect/SCF/IR/SCF.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| |
| namespace fir { |
| |
| using CI = CUDAIntrinsicLibrary; |
| |
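| // Names of the runtime entry points backing the CUDA Fortran cache-hint |
| // loads (__ldca, __ldcg, __ldcs, __ldcv, __ldlu). Each name encodes the |
| // element type and vector width (e.g. r4x4 = four real(4) values); the |
| // trailing underscore is part of the runtime symbol name. |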
| static const char __ldca_i4x4[] = "__ldca_i4x4_"; |
| static const char __ldca_i8x2[] = "__ldca_i8x2_"; |
| static const char __ldca_r2x2[] = "__ldca_r2x2_"; |
| static const char __ldca_r4x4[] = "__ldca_r4x4_"; |
| static const char __ldca_r8x2[] = "__ldca_r8x2_"; |
| static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; |
| static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; |
| static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; |
| static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; |
| static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; |
| static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; |
| static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; |
| static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; |
| static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; |
| static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; |
| static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; |
| static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; |
| static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; |
| static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; |
| static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; |
| static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; |
| static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; |
| static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; |
| static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; |
| static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; |
| |
| // CUDA specific intrinsic handlers. |
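| // The table is kept sorted by name; this is checked by a static_assert |
| // below and relied upon by the binary search in findCUDAIntrinsicHandler. |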
| static constexpr IntrinsicHandler cudaHandlers[]{ |
| {"__ldca_i4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldca_i4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldca_i8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldca_i8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldca_r2x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldca_r2x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldca_r4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldca_r4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldca_r8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldca_r8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcg_i4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcg_i4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcg_i8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcg_i8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcg_r2x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcg_r2x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcg_r4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcg_r4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcg_r8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcg_r8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcs_i4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcs_i4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcs_i8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcs_i8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcs_r2x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcs_r2x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcs_r4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcs_r4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcs_r8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcs_r8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcv_i4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcv_i4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcv_i8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcv_i8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcv_r2x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcv_r2x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcv_r4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcv_r4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldcv_r8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldcv_r8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldlu_i4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldlu_i4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldlu_i8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldlu_i8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldlu_r2x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldlu_r2x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldlu_r4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldlu_r4x4, 4>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"__ldlu_r8x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genLDXXFunc<__ldlu_r8x2, 2>), |
| {{{"a", asAddr}}}, |
| /*isElemental=*/false}, |
| {"all_sync", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>), |
| {{{"mask", asValue}, {"pred", asValue}}}, |
| /*isElemental=*/false}, |
| {"any_sync", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>), |
| {{{"mask", asValue}, {"pred", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicadd_r4x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genAtomicAddVector<2>), |
| {{{"a", asAddr}, {"v", asAddr}}}, |
| /*isElemental=*/false}, |
| {"atomicadd_r4x4", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genAtomicAddVector<4>), |
| {{{"a", asAddr}, {"v", asAddr}}}, |
| /*isElemental=*/false}, |
| {"atomicaddd", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicaddf", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicaddi", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicaddl", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicaddr2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2), |
| {{{"a", asAddr}, {"v", asAddr}}}, |
| /*isElemental=*/false}, |
| {"atomicaddvector_r2x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genAtomicAddVector<2>), |
| {{{"a", asAddr}, {"v", asAddr}}}, |
| /*isElemental=*/false}, |
| {"atomicaddvector_r4x2", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( |
| &CI::genAtomicAddVector<2>), |
| {{{"a", asAddr}, {"v", asAddr}}}, |
| /*isElemental=*/false}, |
| {"atomicandi", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomiccasd", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), |
| {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomiccasf", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), |
| {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomiccasi", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), |
| {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomiccasul", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), |
| {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicdeci", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicexchd", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicexchf", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicexchi", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicexchul", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicinci", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicmaxd", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicmaxf", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicmaxi", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicmaxl", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicmind", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicminf", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicmini", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicminl", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicori", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicsubd", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicsubf", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicsubi", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicsubl", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"atomicxori", |
| static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor), |
| {{{"a", asAddr}, {"v", asValue}}}, |
| /*isElemental=*/false}, |
| {"ballot_sync", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>), |
| {{{"mask", asValue}, {"pred", asValue}}}, |
| /*isElemental=*/false}, |
| {"barrier_arrive", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genBarrierArrive), |
| {{{"barrier", asAddr}}}, |
| /*isElemental=*/false}, |
| {"barrier_arrive_cnt", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genBarrierArriveCnt), |
| {{{"barrier", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"barrier_init", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genBarrierInit), |
| {{{"barrier", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"barrier_try_wait", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genBarrierTryWait), |
| {{{"barrier", asAddr}, {"token", asValue}}}, |
| /*isElemental=*/false}, |
| {"barrier_try_wait_sleep", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genBarrierTryWaitSleep), |
| {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, |
| /*isElemental=*/false}, |
| {"clock", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genNVVMTime<mlir::NVVM::ClockOp>), |
| {}, |
| /*isElemental=*/false}, |
| {"clock64", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genNVVMTime<mlir::NVVM::Clock64Op>), |
| {}, |
| /*isElemental=*/false}, |
| {"fence_proxy_async", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genFenceProxyAsync), |
| {}, |
| /*isElemental=*/false}, |
| {"globaltimer", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>), |
| {}, |
| /*isElemental=*/false}, |
| {"match_all_syncjd", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAllSync), |
| {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, |
| /*isElemental=*/false}, |
| {"match_all_syncjf", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAllSync), |
| {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, |
| /*isElemental=*/false}, |
| {"match_all_syncjj", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAllSync), |
| {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, |
| /*isElemental=*/false}, |
| {"match_all_syncjx", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAllSync), |
| {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, |
| /*isElemental=*/false}, |
| {"match_any_syncjd", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAnySync), |
| {{{"mask", asValue}, {"value", asValue}}}, |
| /*isElemental=*/false}, |
| {"match_any_syncjf", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAnySync), |
| {{{"mask", asValue}, {"value", asValue}}}, |
| /*isElemental=*/false}, |
| {"match_any_syncjj", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAnySync), |
| {{{"mask", asValue}, {"value", asValue}}}, |
| /*isElemental=*/false}, |
| {"match_any_syncjx", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genMatchAnySync), |
| {{{"mask", asValue}, {"value", asValue}}}, |
| /*isElemental=*/false}, |
| {"syncthreads", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genSyncThreads), |
| {}, |
| /*isElemental=*/false}, |
| {"syncthreads_and_i4", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genSyncThreadsAnd), |
| {}, |
| /*isElemental=*/false}, |
| {"syncthreads_and_l4", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genSyncThreadsAnd), |
| {}, |
| /*isElemental=*/false}, |
| {"syncthreads_count_i4", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genSyncThreadsCount), |
| {}, |
| /*isElemental=*/false}, |
| {"syncthreads_count_l4", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genSyncThreadsCount), |
| {}, |
| /*isElemental=*/false}, |
| {"syncthreads_or_i4", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genSyncThreadsOr), |
| {}, |
| /*isElemental=*/false}, |
| {"syncthreads_or_l4", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genSyncThreadsOr), |
| {}, |
| /*isElemental=*/false}, |
| {"syncwarp", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), |
| {}, |
| /*isElemental=*/false}, |
| {"this_grid", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), |
| {}, |
| /*isElemental=*/false}, |
| {"this_thread_block", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( |
| &CI::genThisThreadBlock), |
| {}, |
| /*isElemental=*/false}, |
| {"this_warp", |
| static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp), |
| {}, |
| /*isElemental=*/false}, |
| {"threadfence", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genThreadFence), |
| {}, |
| /*isElemental=*/false}, |
| {"threadfence_block", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genThreadFenceBlock), |
| {}, |
| /*isElemental=*/false}, |
| {"threadfence_system", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genThreadFenceSystem), |
| {}, |
| /*isElemental=*/false}, |
| {"tma_bulk_commit_group", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkCommitGroup), |
| {{}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_g2s", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nbytes", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_ldc4", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkLoadC4), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nelems", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_ldc8", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkLoadC8), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nelems", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_ldi4", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkLoadI4), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nelems", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_ldi8", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkLoadI8), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nelems", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_ldr2", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkLoadR2), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nelems", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_ldr4", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkLoadR4), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nelems", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_ldr8", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkLoadR8), |
| {{{"barrier", asAddr}, |
| {"src", asAddr}, |
| {"dst", asAddr}, |
| {"nelems", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_s2g", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G), |
| {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_store_c4", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkStoreC4), |
| {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_store_c8", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkStoreC8), |
| {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_store_i4", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkStoreI4), |
| {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_store_i8", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkStoreI8), |
| {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_store_r2", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkStoreR2), |
| {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_store_r4", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkStoreR4), |
| {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_store_r8", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkStoreR8), |
| {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, |
| /*isElemental=*/false}, |
| {"tma_bulk_wait_group", |
| static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( |
| &CI::genTMABulkWaitGroup), |
| {{}}, |
| /*isElemental=*/false}, |
| }; |
| |
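| // Compile-time check that an IntrinsicHandler table is strictly sorted by |
| // name, so binary search over it is valid. |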
| template <std::size_t N> |
| static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { |
| // Replace with std::is_sorted when C++20 is the default (constexpr then). |
| const IntrinsicHandler *lastSeen{nullptr}; |
| bool isSorted{true}; |
| for (const auto &x : array) { |
| if (lastSeen) |
| isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; |
| lastSeen = &x; |
| } |
| return isSorted; |
| } |
| static_assert(isSorted(cudaHandlers) && "map must be sorted"); |
| |
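| // Look up the handler for a CUDA intrinsic by name via binary search over |
| // the sorted cudaHandlers table; returns nullptr if the name is unknown. |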
| const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { |
| auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { |
| return name.compare(cudaHandler.name) > 0; |
| }; |
| auto result = llvm::lower_bound(cudaHandlers, name, compare); |
| return result != std::end(cudaHandlers) && result->name == name ? result |
| : nullptr; |
| } |
| |
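| // Convert a FIR pointer-like value to an LLVM pointer in the given NVVM |
| // memory space by first converting to a generic LLVM pointer and then |
| // emitting an address space cast. |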
| static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, |
| mlir::Location loc, |
| mlir::Value barrier, |
| mlir::NVVM::NVVMMemorySpace space) { |
| mlir::Value llvmPtr = fir::ConvertOp::create( |
| builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), |
| barrier); |
| mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( |
| builder, loc, |
| mlir::LLVM::LLVMPointerType::get(builder.getContext(), |
| static_cast<unsigned>(space)), |
| llvmPtr); |
| return addrCast; |
| } |
| |
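| // Emit an llvm.atomicrmw with the given binary operation and sequentially |
| // consistent ordering; the address operand is converted to an LLVM pointer |
| // first. |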
| static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, |
| mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, |
| mlir::Value arg1) { |
| auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); |
| arg0 = builder.createConvert(loc, llvmPointerType, arg0); |
| return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, |
| mlir::LLVM::AtomicOrdering::seq_cst); |
| } |
| |
| // ATOMICADD |
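| // Integer operands lower to an integer add, floating-point operands to |
| // fadd; roughly: |
| //   %p = fir.convert %a : (!fir.ref<f32>) -> !llvm.ptr |
| //   %r = llvm.atomicrmw fadd %p, %v seq_cst : !llvm.ptr, f32 |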
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| mlir::LLVM::AtomicBinOp binOp = |
| mlir::isa<mlir::IntegerType>(args[1].getType()) |
| ? mlir::LLVM::AtomicBinOp::add |
| : mlir::LLVM::AtomicBinOp::fadd; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
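| // ATOMICADDR2 |
| // Atomic add of a pair of real(2) values: load the two f16 elements, pack |
| // them into a <2 x f16> vector, perform a single vector fadd atomicrmw, and |
| // return the 32-bit result. |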
| fir::ExtendedValue |
| CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 2); |
| |
| mlir::Value a = fir::getBase(args[0]); |
| if (mlir::isa<fir::BaseBoxType>(a.getType())) |
| a = fir::BoxAddrOp::create(builder, loc, a); |
| |
| auto f16Ty = builder.getF16Type(); |
| auto i32Ty = builder.getI32Type(); |
| auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); |
| mlir::Type idxTy = builder.getIndexType(); |
| auto f16RefTy = fir::ReferenceType::get(f16Ty); |
| auto zero = builder.createIntegerConstant(loc, idxTy, 0); |
| auto one = builder.createIntegerConstant(loc, idxTy, 1); |
| auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, |
| fir::getBase(args[1]), zero); |
| auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, |
| fir::getBase(args[1]), one); |
| auto v1 = fir::LoadOp::create(builder, loc, v1Coord); |
| auto v2 = fir::LoadOp::create(builder, loc, v2Coord); |
| mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); |
| mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( |
| builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); |
| mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( |
| builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); |
| auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); |
| auto i32VecTy = mlir::VectorType::get({1}, i32Ty); |
| mlir::Value vecI32 = |
| mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); |
| return mlir::vector::ExtractOp::create(builder, loc, vecI32, |
| mlir::ArrayRef<int64_t>{0}); |
| } |
| |
| // ATOMICADDVECTOR |
| template <int extent> |
| fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( |
| mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 2); |
| mlir::Value res = fir::AllocaOp::create( |
| builder, loc, fir::SequenceType::get({extent}, resultType)); |
| mlir::Value a = fir::getBase(args[0]); |
| if (mlir::isa<fir::BaseBoxType>(a.getType())) { |
| a = fir::BoxAddrOp::create(builder, loc, a); |
| } |
| auto vecTy = mlir::VectorType::get({extent}, resultType); |
| auto refTy = fir::ReferenceType::get(resultType); |
| mlir::Type i32Ty = builder.getI32Type(); |
| mlir::Type idxTy = builder.getIndexType(); |
| |
| // Extract the values from the array. |
| llvm::SmallVector<mlir::Value> values; |
| for (unsigned i = 0; i < extent; ++i) { |
| mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); |
| mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, |
| fir::getBase(args[1]), pos); |
| mlir::Value value = fir::LoadOp::create(builder, loc, coord); |
| values.push_back(value); |
| } |
| // Pack the extracted values into a vector to call the atomic add. |
| mlir::Value vec = mlir::LLVM::UndefOp::create(builder, loc, vecTy); |
| for (unsigned i = 0; i < extent; ++i) |
| vec = mlir::LLVM::InsertElementOp::create( |
| builder, loc, vec, values[i], |
| builder.createIntegerConstant(loc, i32Ty, i)); |
| // Atomic operation with a vector of values. |
| mlir::Value add = |
| genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec); |
| // Store results in the result array. |
| for (unsigned i = 0; i < extent; ++i) { |
| mlir::Value r = mlir::LLVM::ExtractElementOp::create( |
| builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); |
| mlir::Value c = fir::CoordinateOp::create( |
| builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); |
| fir::StoreOp::create(builder, loc, r, c); |
| } |
| mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); |
| return fir::ArrayBoxValue(res, {ext}); |
| } |
| |
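| // ATOMICAND |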
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| assert(mlir::isa<mlir::IntegerType>(args[1].getType())); |
| |
| mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
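| // ATOMICOR |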
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| assert(mlir::isa<mlir::IntegerType>(args[1].getType())); |
| |
| mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
| // ATOMICCAS |
| fir::ExtendedValue |
| CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType, |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; |
| auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; |
| auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); |
| |
| mlir::Value arg0 = fir::getBase(args[0]); |
| mlir::Value arg1 = fir::getBase(args[1]); |
| mlir::Value arg2 = fir::getBase(args[2]); |
| |
| auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { |
| if (mlir::isa<mlir::Float32Type>(arg.getType())) |
| return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), |
| arg); |
| if (mlir::isa<mlir::Float64Type>(arg.getType())) |
| return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), |
| arg); |
| return arg; |
| }; |
| |
| arg1 = bitCastFloat(arg1); |
| arg2 = bitCastFloat(arg2); |
| |
| if (arg1.getType() != arg2.getType()) { |
| // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. |
| arg2 = builder.createConvert(loc, arg1.getType(), arg2); |
| } |
| |
| auto address = |
| mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) |
| .getResult(0); |
| auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( |
| builder, loc, address, arg1, arg2, successOrdering, failureOrdering); |
| mlir::Value boolResult = |
| mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); |
| return builder.createConvert(loc, resultType, boolResult); |
| } |
| |
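| // ATOMICDEC |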
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| assert(mlir::isa<mlir::IntegerType>(args[1].getType())); |
| |
| mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
| // ATOMICEXCH |
| fir::ExtendedValue |
| CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType, |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 2); |
| mlir::Value arg0 = fir::getBase(args[0]); |
| mlir::Value arg1 = fir::getBase(args[1]); |
| assert(arg1.getType().isIntOrFloat()); |
| |
| mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; |
| return genAtomBinOp(builder, loc, binOp, arg0, arg1); |
| } |
| |
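| // ATOMICINC |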
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| assert(mlir::isa<mlir::IntegerType>(args[1].getType())); |
| |
| mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
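| // ATOMICMAX |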
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| |
| mlir::LLVM::AtomicBinOp binOp = |
| mlir::isa<mlir::IntegerType>(args[1].getType()) |
| ? mlir::LLVM::AtomicBinOp::max |
| : mlir::LLVM::AtomicBinOp::fmax; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
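| // ATOMICMIN |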
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| |
| mlir::LLVM::AtomicBinOp binOp = |
| mlir::isa<mlir::IntegerType>(args[1].getType()) |
| ? mlir::LLVM::AtomicBinOp::min |
| : mlir::LLVM::AtomicBinOp::fmin; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
| // ATOMICSUB |
| mlir::Value |
| CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| mlir::LLVM::AtomicBinOp binOp = |
| mlir::isa<mlir::IntegerType>(args[1].getType()) |
| ? mlir::LLVM::AtomicBinOp::sub |
| : mlir::LLVM::AtomicBinOp::fsub; |
| return genAtomBinOp(builder, loc, binOp, args[0], args[1]); |
| } |
| |
| // ATOMICXOR |
| fir::ExtendedValue |
| CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType, |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 2); |
| mlir::Value arg0 = fir::getBase(args[0]); |
| mlir::Value arg1 = fir::getBase(args[1]); |
| return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); |
| } |
| |
| // BARRIER_ARRIVE |
| mlir::Value |
| CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 1); |
| mlir::Value barrier = convertPtrToNVVMSpace( |
| builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); |
| return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) |
| .getResult(); |
| } |
| |
| // BARRIER_ARRIVE_CNT |
| mlir::Value |
| CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| mlir::Value barrier = convertPtrToNVVMSpace( |
| builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); |
| return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, |
| {barrier, args[1]}, {}, |
| "mbarrier.arrive.expect_tx.release." |
| "cta.shared::cta.b64 %0, [%1], %2;", |
| {}) |
| .getResult(0); |
| } |
| |
| // BARRIER_INIT |
| void CUDAIntrinsicLibrary::genBarrierInit( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 2); |
| mlir::Value barrier = convertPtrToNVVMSpace( |
| builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); |
| mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, |
| fir::getBase(args[1]), {}); |
| auto kind = mlir::NVVM::ProxyKindAttr::get( |
| builder.getContext(), mlir::NVVM::ProxyKind::async_shared); |
| auto space = mlir::NVVM::SharedSpaceAttr::get( |
| builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); |
| mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); |
| } |
| |
| // BARRIER_TRY_WAIT |
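| // Spin in an scf.while loop, retrying mbarrier.try_wait with a 1000000 ns |
| // suspend-time hint until it reports completion (non-zero result). |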
| mlir::Value |
| CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); |
| mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); |
| fir::StoreOp::create(builder, loc, zero, res); |
| mlir::Value ns = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); |
| mlir::Value load = fir::LoadOp::create(builder, loc, res); |
| auto whileOp = mlir::scf::WhileOp::create( |
| builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); |
| mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); |
| mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); |
| builder.setInsertionPointToStart(beforeBlock); |
| mlir::Value condition = mlir::arith::CmpIOp::create( |
| builder, loc, mlir::arith::CmpIPredicate::eq, beforeArg, zero); |
| mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); |
| mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); |
| afterBlock->addArgument(resultType, loc); |
| builder.setInsertionPointToStart(afterBlock); |
| auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); |
| auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); |
| mlir::Value ret = mlir::NVVM::InlinePtxOp::create( |
| builder, loc, {resultType}, {barrier, args[1], ns}, {}, |
| "{\n" |
| " .reg .pred p;\n" |
| " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" |
| " selp.b32 %0, 1, 0, p;\n" |
| "}", |
| {}) |
| .getResult(0); |
| mlir::scf::YieldOp::create(builder, loc, ret); |
| builder.setInsertionPointAfter(whileOp); |
| return whileOp.getResult(0); |
| } |
| |
| // BARRIER_TRY_WAIT_SLEEP |
| mlir::Value |
| CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 3); |
| auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); |
| auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); |
| return mlir::NVVM::InlinePtxOp::create( |
| builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, |
| "{\n" |
| " .reg .pred p;\n" |
| " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" |
| " selp.b32 %0, 1, 0, p;\n" |
| "}", |
| {}) |
| .getResult(0); |
| } |
| |
| // FENCE_PROXY_ASYNC |
| void CUDAIntrinsicLibrary::genFenceProxyAsync( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 0); |
| auto kind = mlir::NVVM::ProxyKindAttr::get( |
| builder.getContext(), mlir::NVVM::ProxyKind::async_shared); |
| auto space = mlir::NVVM::SharedSpaceAttr::get( |
| builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); |
| mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); |
| } |
| |
| // __LDCA, __LDCS, __LDLU, __LDCV |
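| // Allocate a stack temporary for the <extent> result values and call the |
| // runtime entry point named by fctName with (result, source) reference |
| // arguments, returning the temporary as an array value. |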
| template <const char *fctName, int extent> |
| fir::ExtendedValue |
| CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType, |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 1); |
| mlir::Type resTy = fir::SequenceType::get(extent, resultType); |
| mlir::Value arg = fir::getBase(args[0]); |
| mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); |
| if (mlir::isa<fir::BaseBoxType>(arg.getType())) |
| arg = fir::BoxAddrOp::create(builder, loc, arg); |
| mlir::Type refResTy = fir::ReferenceType::get(resTy); |
| mlir::FunctionType ftype = |
| mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); |
| auto funcOp = builder.createFunction(loc, fctName, ftype); |
| llvm::SmallVector<mlir::Value> funcArgs; |
| funcArgs.push_back(res); |
| funcArgs.push_back(arg); |
| fir::CallOp::create(builder, loc, funcOp, funcArgs); |
| mlir::Value ext = |
| builder.createIntegerConstant(loc, builder.getIndexType(), extent); |
| return fir::ArrayBoxValue(res, {ext}); |
| } |
| |
| // CLOCK, CLOCK64, GLOBALTIMER |
| template <typename OpTy> |
| mlir::Value |
| CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 0 && "expect no arguments"); |
| return OpTy::create(builder, loc, resultType).getResult(); |
| } |
| |
| // MATCH_ALL_SYNC |
| mlir::Value |
| CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 3); |
| bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); |
| |
| mlir::Type i1Ty = builder.getI1Type(); |
| mlir::MLIRContext *context = builder.getContext(); |
| |
| mlir::Value arg1 = args[1]; |
| if (arg1.getType().isF32() || arg1.getType().isF64()) |
| arg1 = fir::ConvertOp::create( |
| builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); |
| |
| mlir::Type retTy = |
| mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); |
| auto match = |
| mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, |
| mlir::NVVM::MatchSyncKind::all) |
| .getResult(); |
| auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); |
| auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); |
| auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); |
| fir::StoreOp::create(builder, loc, conv, args[2]); |
| return value; |
| } |
| |
| // MATCH_ANY_SYNC |
| mlir::Value |
| CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); |
| |
| mlir::Value arg1 = args[1]; |
| if (arg1.getType().isF32() || arg1.getType().isF64()) |
| arg1 = fir::ConvertOp::create( |
| builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); |
| |
| return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], |
| arg1, mlir::NVVM::MatchSyncKind::any) |
| .getResult(); |
| } |
| |
| // SYNCTHREADS |
| void CUDAIntrinsicLibrary::genSyncThreads( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| mlir::NVVM::Barrier0Op::create(builder, loc); |
| } |
| |
| // SYNCTHREADS_AND |
| mlir::Value |
| CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; |
| mlir::MLIRContext *context = builder.getContext(); |
| mlir::Type i32 = builder.getI32Type(); |
| mlir::FunctionType ftype = |
| mlir::FunctionType::get(context, {i32}, {resultType}); |
| auto funcOp = builder.createFunction(loc, funcName, ftype); |
| mlir::Value arg = builder.createConvert(loc, i32, args[0]); |
| return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); |
| } |
| |
| // SYNCTHREADS_COUNT |
| mlir::Value |
| CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; |
| mlir::MLIRContext *context = builder.getContext(); |
| mlir::Type i32 = builder.getI32Type(); |
| mlir::FunctionType ftype = |
| mlir::FunctionType::get(context, {i32}, {resultType}); |
| auto funcOp = builder.createFunction(loc, funcName, ftype); |
| mlir::Value arg = builder.createConvert(loc, i32, args[0]); |
| return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); |
| } |
| |
| // SYNCTHREADS_OR |
| mlir::Value |
| CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; |
| mlir::MLIRContext *context = builder.getContext(); |
| mlir::Type i32 = builder.getI32Type(); |
| mlir::FunctionType ftype = |
| mlir::FunctionType::get(context, {i32}, {resultType}); |
| auto funcOp = builder.createFunction(loc, funcName, ftype); |
| mlir::Value arg = builder.createConvert(loc, i32, args[0]); |
| return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); |
| } |
| |
| // SYNCWARP |
| void CUDAIntrinsicLibrary::genSyncWarp( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 1); |
| constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; |
| mlir::Value mask = fir::getBase(args[0]); |
| mlir::FunctionType funcType = |
| mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); |
| auto funcOp = builder.createFunction(loc, funcName, funcType); |
| llvm::SmallVector<mlir::Value> argsList{mask}; |
| fir::CallOp::create(builder, loc, funcOp, argsList); |
| } |
| |
| // THIS_GRID |
| mlir::Value |
| CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 0); |
| auto recTy = mlir::cast<fir::RecordType>(resultType); |
| assert(recTy && "RecordType expepected"); |
| mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); |
| mlir::Type i32Ty = builder.getI32Type(); |
| |
| mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); |
| mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); |
| mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); |
| |
| mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); |
| mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); |
| mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); |
| |
| mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); |
| mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); |
| mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); |
| mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); |
| mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); |
| mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); |
| |
| // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * |
| // (blockDim.x * gridDim.x); |
| mlir::Value resZ = |
| mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); |
| mlir::Value resY = |
| mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); |
| mlir::Value resX = |
| mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); |
| mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); |
| mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); |
| |
| // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + |
| // blockIdx.x; |
| // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + |
| // ((threadIdx.z * blockDim.y) * blockDim.x) + |
| // (threadIdx.y * blockDim.x) + threadIdx.x + 1; |
| mlir::Value r1 = |
| mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); |
| mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); |
| mlir::Value r3 = |
| mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); |
| mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); |
| mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); |
| |
| mlir::Value bXbY = |
| mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); |
| mlir::Value bXbYbZ = |
| mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); |
| mlir::Value tZbY = |
| mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); |
| mlir::Value tZbYbX = |
| mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); |
| mlir::Value tYbX = |
| mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); |
| mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); |
| rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); |
| rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); |
| rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); |
| mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); |
| rank = mlir::arith::AddIOp::create(builder, loc, rank, one); |
| |
| auto sizeFieldName = recTy.getTypeList()[1].first; |
| mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; |
| mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); |
| mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( |
| builder, loc, fieldIndexType, sizeFieldName, recTy, |
| /*typeParams=*/mlir::ValueRange{}); |
| mlir::Value sizeCoord = fir::CoordinateOp::create( |
| builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); |
| fir::StoreOp::create(builder, loc, size, sizeCoord); |
| |
| auto rankFieldName = recTy.getTypeList()[2].first; |
| mlir::Type rankFieldTy = recTy.getTypeList()[2].second; |
| mlir::Value rankFieldIndex = fir::FieldIndexOp::create( |
| builder, loc, fieldIndexType, rankFieldName, recTy, |
| /*typeParams=*/mlir::ValueRange{}); |
| mlir::Value rankCoord = fir::CoordinateOp::create( |
| builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); |
| fir::StoreOp::create(builder, loc, rank, rankCoord); |
| return res; |
| } |
| |
| // THIS_THREAD_BLOCK |
| mlir::Value |
| CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 0); |
| auto recTy = mlir::cast<fir::RecordType>(resultType); |
| assert(recTy && "RecordType expepected"); |
| mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); |
| mlir::Type i32Ty = builder.getI32Type(); |
| |
| // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; |
| mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); |
| mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); |
| mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); |
| mlir::Value size = |
| mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); |
| size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); |
| |
| // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + |
| // (threadIdx.y * blockDim.x) + threadIdx.x + 1; |
| mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); |
| mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); |
| mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); |
| mlir::Value r1 = |
| mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); |
| mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); |
| mlir::Value r3 = |
| mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); |
| mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); |
| mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); |
| mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); |
| rank = mlir::arith::AddIOp::create(builder, loc, rank, one); |
| |
| auto sizeFieldName = recTy.getTypeList()[1].first; |
| mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; |
| mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); |
| mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( |
| builder, loc, fieldIndexType, sizeFieldName, recTy, |
| /*typeParams=*/mlir::ValueRange{}); |
| mlir::Value sizeCoord = fir::CoordinateOp::create( |
| builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); |
| fir::StoreOp::create(builder, loc, size, sizeCoord); |
| |
| auto rankFieldName = recTy.getTypeList()[2].first; |
| mlir::Type rankFieldTy = recTy.getTypeList()[2].second; |
| mlir::Value rankFieldIndex = fir::FieldIndexOp::create( |
| builder, loc, fieldIndexType, rankFieldName, recTy, |
| /*typeParams=*/mlir::ValueRange{}); |
| mlir::Value rankCoord = fir::CoordinateOp::create( |
| builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); |
| fir::StoreOp::create(builder, loc, rank, rankCoord); |
| return res; |
| } |
| |
| // THIS_WARP |
| mlir::Value |
| CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 0); |
| auto recTy = mlir::cast<fir::RecordType>(resultType); |
| assert(recTy && "RecordType expepected"); |
| mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); |
| mlir::Type i32Ty = builder.getI32Type(); |
| |
| // coalesced_group%size = 32 |
| mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); |
| auto sizeFieldName = recTy.getTypeList()[1].first; |
| mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; |
| mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); |
| mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( |
| builder, loc, fieldIndexType, sizeFieldName, recTy, |
| /*typeParams=*/mlir::ValueRange{}); |
| mlir::Value sizeCoord = fir::CoordinateOp::create( |
| builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); |
| fir::StoreOp::create(builder, loc, size, sizeCoord); |
| |
| // coalesced_group%rank = (threadIdx.x & 31) + 1 |
| mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); |
| mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); |
| mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); |
| mlir::Value masked = |
| mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); |
| mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); |
| auto rankFieldName = recTy.getTypeList()[2].first; |
| mlir::Type rankFieldTy = recTy.getTypeList()[2].second; |
| mlir::Value rankFieldIndex = fir::FieldIndexOp::create( |
| builder, loc, fieldIndexType, rankFieldName, recTy, |
| /*typeParams=*/mlir::ValueRange{}); |
| mlir::Value rankCoord = fir::CoordinateOp::create( |
| builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); |
| fir::StoreOp::create(builder, loc, rank, rankCoord); |
| return res; |
| } |
| |
| // THREADFENCE |
| void CUDAIntrinsicLibrary::genThreadFence( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; |
| mlir::FunctionType funcType = |
| mlir::FunctionType::get(builder.getContext(), {}, {}); |
| auto funcOp = builder.createFunction(loc, funcName, funcType); |
| llvm::SmallVector<mlir::Value> noArgs; |
| fir::CallOp::create(builder, loc, funcOp, noArgs); |
| } |
| |
| // THREADFENCE_BLOCK |
| void CUDAIntrinsicLibrary::genThreadFenceBlock( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; |
| mlir::FunctionType funcType = |
| mlir::FunctionType::get(builder.getContext(), {}, {}); |
| auto funcOp = builder.createFunction(loc, funcName, funcType); |
| llvm::SmallVector<mlir::Value> noArgs; |
| fir::CallOp::create(builder, loc, funcOp, noArgs); |
| } |
| |
| // THREADFENCE_SYSTEM |
| void CUDAIntrinsicLibrary::genThreadFenceSystem( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; |
| mlir::FunctionType funcType = |
| mlir::FunctionType::get(builder.getContext(), {}, {}); |
| auto funcOp = builder.createFunction(loc, funcName, funcType); |
| llvm::SmallVector<mlir::Value> noArgs; |
| fir::CallOp::create(builder, loc, funcOp, noArgs); |
| } |
| |
| // TMA_BULK_COMMIT_GROUP |
| void CUDAIntrinsicLibrary::genTMABulkCommitGroup( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 0); |
| mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); |
| } |
| |
| // TMA_BULK_G2S |
| void CUDAIntrinsicLibrary::genTMABulkG2S( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value barrier = convertPtrToNVVMSpace( |
| builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); |
| mlir::Value dst = |
| convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), |
| mlir::NVVM::NVVMMemorySpace::SharedCluster); |
| mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), |
| mlir::NVVM::NVVMMemorySpace::Global); |
| mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( |
| builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); |
| } |
| |
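| // Shared helper for the TMA_BULK_LD* intrinsics: scale the element count |
| // by the element size in bytes, then emit the bulk global-to-shared copy |
| // and the matching mbarrier expect_tx update through inline PTX. |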
| static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, |
| mlir::Value barrier, mlir::Value src, |
| mlir::Value dst, mlir::Value nelem, |
| mlir::Value eleSize) { |
| mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); |
| auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); |
| barrier = builder.createConvert(loc, llvmPtrTy, barrier); |
| dst = builder.createConvert(loc, llvmPtrTy, dst); |
| src = builder.createConvert(loc, llvmPtrTy, src); |
| mlir::NVVM::InlinePtxOp::create( |
| builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, |
| "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " |
| "[%1], %2, [%3];", |
| {}); |
| mlir::NVVM::InlinePtxOp::create( |
| builder, loc, mlir::TypeRange{}, {barrier, size}, {}, |
| "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); |
| } |
| |
| // TMA_BULK_LOADC4 |
| void CUDAIntrinsicLibrary::genTMABulkLoadC4( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 8); |
| genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), fir::getBase(args[3]), eleSize); |
| } |
| |
| // TMA_BULK_LOADC8 |
| void CUDAIntrinsicLibrary::genTMABulkLoadC8( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 16); |
| genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), fir::getBase(args[3]), eleSize); |
| } |
| |
| // TMA_BULK_LOADI4 |
| void CUDAIntrinsicLibrary::genTMABulkLoadI4( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 4); |
| genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), fir::getBase(args[3]), eleSize); |
| } |
| |
| // TMA_BULK_LOADI8 |
| void CUDAIntrinsicLibrary::genTMABulkLoadI8( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 8); |
| genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), fir::getBase(args[3]), eleSize); |
| } |
| |
| // TMA_BULK_LOADR2 |
| void CUDAIntrinsicLibrary::genTMABulkLoadR2( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 2); |
| genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), fir::getBase(args[3]), eleSize); |
| } |
| |
| // TMA_BULK_LOADR4 |
| void CUDAIntrinsicLibrary::genTMABulkLoadR4( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 4); |
| genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), fir::getBase(args[3]), eleSize); |
| } |
| |
| // TMA_BULK_LOADR8 |
| void CUDAIntrinsicLibrary::genTMABulkLoadR8( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 4); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 8); |
| genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), fir::getBase(args[3]), eleSize); |
| } |
| |
| // TMA_BULK_S2G |
| void CUDAIntrinsicLibrary::genTMABulkS2G( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), |
| mlir::NVVM::NVVMMemorySpace::Shared); |
| mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), |
| mlir::NVVM::NVVMMemorySpace::Global); |
| mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( |
| builder, loc, dst, src, fir::getBase(args[2]), {}, {}); |
| |
| mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, |
| "cp.async.bulk.commit_group;", {}); |
| mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, |
| builder.getI32IntegerAttr(0), {}); |
| } |
| |
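| // Shared helper for the TMA_BULK_STORE_* intrinsics: emit a bulk |
| // shared-to-global copy of count * eleSize bytes, then commit the group |
| // and wait for it to complete. |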
| static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, |
| mlir::Value src, mlir::Value dst, mlir::Value count, |
| mlir::Value eleSize) { |
| mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); |
| src = convertPtrToNVVMSpace(builder, loc, src, |
| mlir::NVVM::NVVMMemorySpace::Shared); |
| dst = convertPtrToNVVMSpace(builder, loc, dst, |
| mlir::NVVM::NVVMMemorySpace::Global); |
| mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, |
| size, {}, {}); |
| mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, |
| "cp.async.bulk.commit_group;", {}); |
| mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, |
| builder.getI32IntegerAttr(0), {}); |
| } |
| |
| // TMA_BULK_STORE_C4 |
| void CUDAIntrinsicLibrary::genTMABulkStoreC4( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 8); |
| genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), eleSize); |
| } |
| |
| // TMA_BULK_STORE_C8 |
| void CUDAIntrinsicLibrary::genTMABulkStoreC8( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 16); |
| genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), eleSize); |
| } |
| |
| // TMA_BULK_STORE_I4 |
| void CUDAIntrinsicLibrary::genTMABulkStoreI4( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 4); |
| genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), eleSize); |
| } |
| |
| // TMA_BULK_STORE_I8 |
| void CUDAIntrinsicLibrary::genTMABulkStoreI8( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 8); |
| genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), eleSize); |
| } |
| |
| // TMA_BULK_STORE_R2 |
| void CUDAIntrinsicLibrary::genTMABulkStoreR2( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 2); |
| genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), eleSize); |
| } |
| |
| // TMA_BULK_STORE_R4 |
| void CUDAIntrinsicLibrary::genTMABulkStoreR4( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 4); |
| genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), eleSize); |
| } |
| |
| // TMA_BULK_STORE_R8 |
| void CUDAIntrinsicLibrary::genTMABulkStoreR8( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 3); |
| mlir::Value eleSize = |
| builder.createIntegerConstant(loc, builder.getI32Type(), 8); |
| genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), |
| fir::getBase(args[2]), eleSize); |
| } |
| |
| // TMA_BULK_WAIT_GROUP |
| void CUDAIntrinsicLibrary::genTMABulkWaitGroup( |
| llvm::ArrayRef<fir::ExtendedValue> args) { |
| assert(args.size() == 0); |
| auto group = builder.getI32IntegerAttr(0); |
| mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); |
| } |
| |
| // ALL_SYNC, ANY_SYNC, BALLOT_SYNC |
| template <mlir::NVVM::VoteSyncKind kind> |
| mlir::Value |
| CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType, |
| llvm::ArrayRef<mlir::Value> args) { |
| assert(args.size() == 2); |
| mlir::Value arg1 = |
| fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); |
| mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot |
| ? builder.getI32Type() |
| : builder.getI1Type(); |
| auto voteRes = |
| mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) |
| .getResult(); |
| return fir::ConvertOp::create(builder, loc, resultType, voteRes); |
| } |
| |
| } // namespace fir |