//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"
#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir
using namespace mlir;
// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
Location loc, Value value,
const LLVMTypeConverter &converter) {
  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
  auto indexBitwidthType =
      IntegerType::get(rewriter.getContext(), indexBitwidth);
// TODO: use <=> in C++20.
if (indexBitwidth > intWidth) {
return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
}
if (indexBitwidth < intWidth) {
return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
}
return value;
}
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
bool canBeBare = true;
for (Type type : func.getArgumentTypes())
if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
return canBeBare;
}
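/// Compute the current workitem's lane id by counting, via
/// llvm.amdgcn.mbcnt.lo followed by llvm.amdgcn.mbcnt.hi, how many bits of an
/// all-ones 64-bit mask lie below the current lane.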
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc) {
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
ValueRange{minus1, mbcntLo});
return laneId;
}
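// Default LLVM data layout for AMDGCN targets. It is attached to the GPU
// module as the llvm.data_layout attribute when the module does not already
// specify one (see runOnOperation below).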
static constexpr StringLiteral amdgcnDataLayout =
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
"32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
"64-S32-A5-G1-ni:7:8:9";
namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
LogicalResult
matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto loc = op->getLoc();
MLIRContext *context = rewriter.getContext();
// convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
// followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
Type intTy = IntegerType::get(context, 32);
Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
Value mbcntLo =
rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
loc, intTy, ValueRange{minus1, mbcntLo});
// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
if (indexBitwidth > 32) {
laneId = rewriter.create<LLVM::SExtOp>(
loc, IntegerType::get(context, indexBitwidth), laneId);
} else if (indexBitwidth < 32) {
laneId = rewriter.create<LLVM::TruncOp>(
loc, IntegerType::get(context, indexBitwidth), laneId);
}
rewriter.replaceOp(op, {laneId});
return success();
}
};
struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
amdgpu::Chipset chipset)
: ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
chipset(chipset) {}
LogicalResult
matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
LLVM::ConstantRangeAttr bounds = nullptr;
bool isBeforeGfx10 = chipset.majorVersion < 10;
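    // Wavefronts are always 64 lanes wide before gfx10; gfx10+ can run in
    // either wave32 or wave64 mode, so the lower bound of the range depends
    // on the chipset.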
if (auto upperBoundAttr = op.getUpperBoundAttr()) {
bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
/*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
/*upper=*/op.getUpperBoundAttr().getInt() + 1);
}
Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
op.getLoc(), rewriter.getI32Type(), bounds);
wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
*getTypeConverter());
rewriter.replaceOp(op, {wavefrontOp});
return success();
}
const amdgpu::Chipset chipset;
};
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
/// Lowers a shuffle to the corresponding ROCDL ops.
///
  /// Use the `width` argument to check whether the source lane participates in
  /// the shuffle; if it does not, the destination lane is the lane itself.
  ///
  /// Shuffle with DS Bpermute:
  ///   let shflMode = [xor, up, down, idx]
  ///   let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width].
  ///   1. curLaneId = mbcnt.hi(-1, mbcnt.lo(-1, 0))
  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
  ///   3. dstLane = shflMode(curLaneId, step)
  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  ///   6. dwordAlignedDstLane = dstLane * 4 (i.e. dstLane << 2)
/// 7. bpermute(dwordAlignedDstLane, shfl_value).
///
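  /// For example (illustrative), lowering
  ///   %shfl, %pred = gpu.shuffle xor %val, %offset, %width : f32
  /// emits a rocdl.ds_bpermute that reads %val from the lane whose id is the
  /// XOR of the current lane id and %offset.
  ///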
LogicalResult
matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Location loc = op->getLoc();
Value initShflValue = adaptor.getValue();
    Value srcLaneId = getLaneId(rewriter, loc);
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value width = adaptor.getWidth();
Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
Value widthOrZeroIfOutside =
rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
Value dstLane;
// TODO: Use ds_swizzle for XOR when step/offsets are constants for better
// perf.
switch (op.getMode()) {
case gpu::ShuffleMode::UP:
dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
break;
case gpu::ShuffleMode::DOWN:
dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
break;
case gpu::ShuffleMode::XOR:
dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
break;
case gpu::ShuffleMode::IDX:
dstLane = adaptor.getOffset();
break;
}
Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
dstLane, srcLaneId);
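    // ds_bpermute takes a byte address, so convert the selected lane index
    // into a dword-aligned byte offset (lane * 4).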
Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
Value dwordAlignedDstLane =
rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
SmallVector<Value> decomposed =
LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
SmallVector<Value> swizzled;
for (Value v : decomposed) {
Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
dwordAlignedDstLane, v);
swizzled.emplace_back(res);
}
Value shflValue =
LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
return success();
}
};
/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"
// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
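//
// Typical invocation (illustrative; option names come from the
// ConvertGpuOpsToROCDLOps definition in Passes.td):
//   mlir-opt --convert-gpu-to-rocdl='chipset=gfx908 index-bitwidth=32' in.mlir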
struct LowerGpuOpsToROCDLOpsPass final
: public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
LowerGpuOpsToROCDLOpsPass() = default;
LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
bool useBarePtrCallConv,
gpu::amd::Runtime runtime) {
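    // Command-line options take precedence: only fill in values that were not
    // already provided on the command line.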
if (this->chipset.getNumOccurrences() == 0)
this->chipset = chipset;
if (this->indexBitwidth.getNumOccurrences() == 0)
this->indexBitwidth = indexBitwidth;
if (this->useBarePtrCallConv.getNumOccurrences() == 0)
this->useBarePtrCallConv = useBarePtrCallConv;
if (this->runtime.getNumOccurrences() == 0)
this->runtime = runtime;
}
void getDependentDialects(DialectRegistry &registry) const override {
Base::getDependentDialects(registry);
registerConvertToLLVMDependentDialectLoading(registry);
}
void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
MLIRContext *ctx = m.getContext();
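    // Attach the default AMDGCN data layout if the module does not already
    // carry an explicit llvm.data_layout attribute.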
auto llvmDataLayout = m->getAttrOfType<StringAttr>(
LLVM::LLVMDialect::getDataLayoutAttrName());
if (!llvmDataLayout) {
llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
}
// Request C wrapper emission.
for (auto func : m.getOps<func::FuncOp>()) {
func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
UnitAttr::get(ctx));
}
FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
if (failed(maybeChipset)) {
emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
return signalPassFailure();
}
/// Customize the bitwidth used for the device side index computations.
LowerToLLVMOptions options(
ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
options.overrideIndexBitwidth(indexBitwidth);
if (useBarePtrCallConv) {
options.useBarePtrCallConv = true;
WalkResult canUseBarePointers =
m.walk([](gpu::GPUFuncOp func) -> WalkResult {
if (canBeCalledWithBarePointers(func))
return WalkResult::advance();
return WalkResult::interrupt();
});
if (canUseBarePointers.wasInterrupted()) {
emitError(UnknownLoc::get(ctx),
"bare pointer calling convention requires all memrefs to "
"have static shape and use the identity map");
return signalPassFailure();
}
}
    // Apply in-dialect lowering. In-dialect lowering replaces ops with other
    // ops that themselves need further lowering, something a single
    // conversion pass cannot do.
{
RewritePatternSet patterns(ctx);
populateGpuRewritePatterns(patterns);
arith::populateExpandBFloat16Patterns(patterns);
(void)applyPatternsGreedily(m, std::move(patterns));
}
LLVMTypeConverter converter(ctx, options);
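    // Map GPU memory spaces to the numeric address spaces used by the AMDGPU
    // target: 1 = global, 3 = workgroup (LDS), 5 = private (scratch).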
populateGpuMemorySpaceAttributeConversions(
converter, [](gpu::AddressSpace space) {
switch (space) {
case gpu::AddressSpace::Global:
return 1;
case gpu::AddressSpace::Workgroup:
return 3;
case gpu::AddressSpace::Private:
return 5;
}
llvm_unreachable("unknown address space enum value");
return 0;
});
RewritePatternSet llvmPatterns(ctx);
LLVMConversionTarget target(getContext());
llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
allowedDialects.end());
for (Dialect *dialect : ctx->getLoadedDialects()) {
bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
// Empty `allowedDialectsSet` means all dialects are allowed.
if (!allowedDialectsSet.empty() && !allowed)
continue;
auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
if (!iface) {
        // Error out if the dialect was explicitly specified but does not
        // implement the conversion interface.
if (allowed) {
m.emitError()
<< "dialect does not implement ConvertToLLVMPatternInterface: "
<< dialect->getNamespace();
return signalPassFailure();
}
continue;
}
iface->populateConvertToLLVMConversionPatterns(target, converter,
llvmPatterns);
}
populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
*maybeChipset);
populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
*maybeChipset);
configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();
auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
auto reqdWorkGroupSizeAttrHelper =
rocdlDialect->getReqdWorkGroupSizeAttrHelper();
auto flatWorkGroupSizeAttrHelper =
rocdlDialect->getFlatWorkGroupSizeAttrHelper();
// Manually rewrite known block size attributes so the LLVMIR translation
// infrastructure can pick them up.
m.walk([&](LLVM::LLVMFuncOp op) {
if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
// Also set up the rocdl.flat_work_group_size attribute to prevent
// conflicting metadata.
uint32_t flatSize = 1;
for (uint32_t size : blockSizes.asArrayRef()) {
flatSize *= size;
}
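        // rocdl.flat_work_group_size is a "min,max" string; pin both bounds
        // to the product of the required block dimensions.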
StringAttr flatSizeAttr =
StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
}
});
}
};
} // namespace
void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
target.addIllegalOp<func::FuncOp>();
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<ROCDL::ROCDLDialect>();
target.addIllegalDialect<gpu::GPUDialect>();
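  // The following LLVM math intrinsics must not survive the conversion; math
  // operations are expected to be lowered to ROCm device-library calls
  // instead (see populateMathToROCDLConversionPatterns).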
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
  // These ops are legal for the f32 type.
target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
});
// TODO: Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}
void mlir::populateGpuToROCDLConversionPatterns(
const LLVMTypeConverter &converter, RewritePatternSet &patterns,
mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
using gpu::index_lowering::IndexKind;
using gpu::index_lowering::IntrType;
using mlir::gpu::amd::Runtime;
auto *rocdlDialect =
converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
populateWithGenerated(patterns);
patterns.add<
gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
converter, IndexKind::Block, IntrType::Id);
patterns.add<gpu::index_lowering::OpLowering<
gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
converter, IndexKind::Grid, IntrType::Id);
patterns.add<
gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
converter, IndexKind::Block, IntrType::Dim);
patterns.add<gpu::index_lowering::OpLowering<
gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
converter, IndexKind::Grid, IntrType::Dim);
patterns.add<GPUReturnOpLowering>(converter);
patterns.add<GPUFuncOpLowering>(
converter,
GPUFuncOpLoweringOptions{
/*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
/*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
rocdlDialect->getKernelAttrHelper().getName(),
rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
if (Runtime::HIP == runtime) {
patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {
// Use address space = 4 to match the OpenCL definition of printf()
patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
}
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns);
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
unsigned indexBitwidth,
bool useBarePtrCallConv,
gpu::amd::Runtime runtime) {
return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
chipset, indexBitwidth, useBarePtrCallConv, runtime);
}