//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

| #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" |
| #include "mlir/Dialect/Arith/Transforms/Passes.h" |
| #include "mlir/Pass/Pass.h" |
| #include "mlir/Pass/PassManager.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" |
| #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h" |
| #include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h" |
| #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" |
| #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" |
| #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" |
| #include "mlir/Conversion/LLVMCommon/Pattern.h" |
| #include "mlir/Conversion/LLVMCommon/TypeConverter.h" |
| #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" |
| #include "mlir/Conversion/MathToROCDL/MathToROCDL.h" |
| #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" |
| #include "mlir/Dialect/Func/IR/FuncOps.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/GPU/Transforms/Passes.h" |
| #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" |
| #include "mlir/Dialect/Math/IR/Math.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/Pass/Pass.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
| #include "llvm/Support/FormatVariadic.h" |
| |
| #include "../GPUCommon/GPUOpsLowering.h" |
| #include "../GPUCommon/IndexIntrinsicsOpLowering.h" |
| |
namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
                                  Location loc, Value value,
                                  const LLVMTypeConverter &converter) {
  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
  auto indexBitwidthType =
      IntegerType::get(rewriter.getContext(), indexBitwidth);
  // TODO: use <=> in C++20.
  if (indexBitwidth > intWidth) {
    return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
  }
  if (indexBitwidth < intWidth) {
    return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
  }
  return value;
}
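
// Illustrative sketch (not emitted verbatim): given a 64-bit index bitwidth,
// an i32 value `%v` becomes
//   %w = llvm.sext %v : i32 to i64
// while a 32-bit index bitwidth returns `%v` unchanged.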

/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}
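
// Illustrative note: a statically-shaped, identity-layout argument such as
// memref<4x4xf32> can be passed as a single !llvm.ptr under the bare pointer
// convention, whereas a dynamically-shaped memref<?xf32> requires the full
// memref descriptor and so fails the check above.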

/// Returns the lane id of the current thread, computed with the
/// mbcnt.lo/mbcnt.hi intrinsics.
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}

// Data layout string for AMDGCN targets; among other things it encodes the
// pointer widths of the AMDGPU address spaces, e.g. 32-bit private (p5) and
// local (p3) pointers.
static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
    "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
    "64-S32-A5-G1-ni:7:8:9";

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    // Convert to:
    //   %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    //   %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
    Value laneId = getLaneId(rewriter, loc);
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    laneId = truncOrExtToLLVMType(rewriter, loc, laneId, *getTypeConverter());
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};
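
// Illustrative sketch: with a 32-bit index bitwidth, `%id = gpu.lane_id`
// is expected to lower roughly to
//   %0 = rocdl.mbcnt.lo %c-1, %c0 : i32
//   %1 = rocdl.mbcnt.hi %c-1, %0 : i32
// (the exact printed form of the ROCDL ops may differ).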

struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
                           amdgpu::Chipset chipset)
      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
        chipset(chipset) {}

  LogicalResult
  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    LLVM::ConstantRangeAttr bounds = nullptr;
    // Chipsets before gfx10 only support wave64; gfx10+ also supports wave32.
    bool isBeforeGfx10 = chipset.majorVersion < 10;
    if (auto upperBoundAttr = op.getUpperBoundAttr()) {
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
          /*upper=*/upperBoundAttr.getInt() + 1);
    }
    Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
        op.getLoc(), rewriter.getI32Type(), bounds);
    wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
                                       *getTypeConverter());
    rewriter.replaceOp(op, {wavefrontOp});
    return success();
  }

  const amdgpu::Chipset chipset;
};
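
// Illustrative note: for a pre-gfx10 chipset with an upper bound attribute of
// 64, the attached range is [64, 65), i.e. the wavefront size is known to be
// exactly 64, which lets LLVM fold it to a constant.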

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// The `width` argument is used to check whether the source lane
  /// participates; if it does not, the destination lane falls back to the
  /// current lane.
  ///
  /// Shuffle with DS Bpermute:
  ///   let shflMode = [xor, up, down, idx]
  ///   let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width]
  ///   1. curLaneId = mbcnt.lo + mbcnt.hi
  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
  ///   3. dstLane = shflMode(curLaneId, step)
  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  ///   6. dwordAlignedDstLane = dstLane * 4, i.e. dstLane << 2
  ///   7. bpermute(dwordAlignedDstLane, shfl_value)
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value initShflValue = adaptor.getValue();

    Value srcLaneId = getLaneId(rewriter, loc);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;
    // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
    // perf.
    switch (op.getMode()) {
    case gpu::ShuffleMode::UP:
      dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::DOWN:
      dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);

    // ds_bpermute operates on 32-bit words, so decompose the value into i32
    // pieces, permute each piece, and recompose the result.
    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
                                                       dwordAlignedDstLane, v);
      swizzled.emplace_back(res);
    }
    Value shflValue =
        LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};
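
// Illustrative sketch: a butterfly shuffle
//   %res, %valid = gpu.shuffle xor %val, %offset, %width : f32
// becomes, roughly, a lane-id computation, `lane ^ %offset`, a validity
// compare against the width mask, and one rocdl.ds_bpermute per 32-bit piece
// of %val. (Sketch only; the exact IR depends on the shuffled type.)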

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering first. These rewrites replace ops with forms
    // that themselves need further lowering, which a single conversion pass
    // cannot handle.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      arith::populateExpandBFloat16Patterns(patterns);
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

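    // With this mapping, e.g. memref<16xf32, #gpu.address_space<workgroup>>
    // converts to a pointer in LLVM address space 3 (LDS), following the
    // AMDGPU address-space numbering. (Illustrative note.)
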
    RewritePatternSet llvmPatterns(ctx);
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // An empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly specified but does not
        // implement the conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                      LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
  // These ops remain legal for the f32 type.
  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
  });
  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}
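
// Illustrative note: under the rules above, `llvm.exp` on f32 stays legal,
// while `llvm.exp` on f64 is illegal and must have been rewritten (e.g. to a
// device-library call) before the conversion can succeed.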

void mlir::populateGpuToROCDLConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  using mlir::gpu::amd::Runtime;
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
          /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
          rocdlDialect->getKernelAttrHelper().getName(),
          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  if (runtime == Runtime::HIP) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (runtime == Runtime::OpenCL) {
    // Use address space 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory.
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

  populateMathToROCDLConversionPatterns(converter, patterns);
}
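
// A minimal usage sketch, assuming an already-loaded gpu.module `gpuModule`,
// a parsed `chipset`, and configured `options` (names here are illustrative):
//
//   LLVMTypeConverter converter(ctx, options);
//   LLVMConversionTarget target(*ctx);
//   RewritePatternSet patterns(ctx);
//   populateGpuToROCDLConversionPatterns(converter, patterns,
//                                        gpu::amd::Runtime::HIP, chipset);
//   configureGpuToROCDLConversionLegality(target);
//   if (failed(applyPartialConversion(gpuModule, target, std::move(patterns))))
//     /* handle failure */;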

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}