//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific intrinsic combines that InstCombine
// invokes through the TargetTransformInfo hooks. It uses the target's detailed
// information to fold and simplify llvm.amdgcn.* intrinsic calls, while
// letting the target-independent InstCombine logic handle the rest.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "AMDGPUtti"
namespace {
struct AMDGPUImageDMaskIntrinsic {
unsigned Intr;
};
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"
} // end anonymous namespace
// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
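//
// For example, with three finite inputs the result is simply the median:
//   fmed3(1.0, 3.0, 2.0) == 2.0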
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
const APFloat &Src2) {
APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
APFloat::cmpResult Cmp0 = Max3.compare(Src0);
assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
if (Cmp0 == APFloat::cmpEqual)
return maxnum(Src1, Src2);
APFloat::cmpResult Cmp1 = Max3.compare(Src1);
assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
if (Cmp1 == APFloat::cmpEqual)
return maxnum(Src0, Src2);
return maxnum(Src0, Src1);
}
// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
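// For example, (fpext half %x to float) can safely be narrowed back to half,
// and the constant 2.0 converts to half exactly, whereas 1.0e10 does not fit
// in half and so cannot be narrowed.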
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
Type *VTy = V.getType();
if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
// The value is already 16-bit, so we don't want to convert to 16-bit again!
return false;
}
if (IsFloat) {
if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
// We need to check that if we cast the value down to a half, we do not
// lose precision.
APFloat FloatValue(ConstFloat->getValueAPF());
bool LosesInfo = true;
FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
&LosesInfo);
return !LosesInfo;
}
} else {
if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
// We need to check that if we cast the value down to an i16, we do not
// lose precision.
APInt IntValue(ConstInt->getValue());
return IntValue.getActiveBits() <= 16;
}
}
Value *CastSrc;
bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
: match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
if (IsExt) {
Type *CastSrcTy = CastSrc->getType();
if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
return true;
}
return false;
}
// Convert a value to 16-bit.
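// If \p V is itself an fpext/sext/zext, simply return its 16-bit source;
// otherwise create a cast down to i16 (for integers) or half (for floats)
// through the builder.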
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
Type *VTy = V.getType();
if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
return cast<Instruction>(&V)->getOperand(0);
if (VTy->isIntegerTy())
return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
if (VTy->isFloatingPointTy())
return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
llvm_unreachable("Should never be called!");
}
/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates an intrinsic call with
/// the modified arguments (based on OldIntr), and replaces InstToReplace with
/// the newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
InstCombiner &IC,
std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
Func) {
SmallVector<Type *, 4> ArgTys;
if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
return std::nullopt;
SmallVector<Value *, 8> Args(OldIntr.args());
// Modify arguments and types
Func(Args, ArgTys);
Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
CallInst *NewCall = IC.Builder.CreateCall(I, Args);
NewCall->takeName(&OldIntr);
NewCall->copyMetadata(OldIntr);
if (isa<FPMathOperator>(NewCall))
NewCall->copyFastMathFlags(&OldIntr);
// Erase and replace uses
if (!InstToReplace.getType()->isVoidTy())
IC.replaceInstUsesWith(InstToReplace, NewCall);
bool RemoveOldIntr = &OldIntr != &InstToReplace;
auto RetValue = IC.eraseInstFromFunction(InstToReplace);
if (RemoveOldIntr)
IC.eraseInstFromFunction(OldIntr);
return RetValue;
}
static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
IntrinsicInst &II, InstCombiner &IC) {
// Optimize _L to _LZ when the 'lod' operand is zero (or negative)
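// For example (illustrative IR; the exact overload mangling is omitted):
//   call <4 x float> @llvm.amdgcn.image.sample.l.2d(..., float 0.0, ...)
// can be rewritten as
//   call <4 x float> @llvm.amdgcn.image.sample.lz.2d(...)
// with the lod operand dropped.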
if (const auto *LZMappingInfo =
AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
if (auto *ConstantLod =
dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
ImageDimIntr->Dim);
return modifyIntrinsicCall(
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->LodIndex);
});
}
}
}
// Optimize _mip away when 'lod' is zero
if (const auto *MIPMappingInfo =
AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
if (auto *ConstantMip =
dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
if (ConstantMip->isZero()) {
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
ImageDimIntr->Dim);
return modifyIntrinsicCall(
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->MipIndex);
});
}
}
}
// Optimize _bias away when 'bias' is zero
if (const auto *BiasMappingInfo =
AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
if (auto *ConstantBias =
dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
if (ConstantBias->isZero()) {
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
ImageDimIntr->Dim);
return modifyIntrinsicCall(
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
});
}
}
}
// Optimize _offset away when 'offset' is zero
if (const auto *OffsetMappingInfo =
AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
if (auto *ConstantOffset =
dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
if (ConstantOffset->isZero()) {
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
AMDGPU::getImageDimIntrinsicByBaseOpcode(
OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
return modifyIntrinsicCall(
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
});
}
}
}
// Try to use D16
if (ST->hasD16Images()) {
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
if (BaseOpcode->HasD16) {
// If the only use of the image intrinsic is an fptrunc (with conversion to
// half), then both the fptrunc and the image intrinsic are replaced with an
// image intrinsic carrying the D16 flag, i.e. one that returns half directly.
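// For example (illustrative IR; overload mangling omitted):
//   %v = call <4 x float> @llvm.amdgcn.image.sample.2d(...)
//   %h = fptrunc <4 x float> %v to <4 x half>
// becomes a single image sample call returning <4 x half> directly.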
if (II.hasOneUse()) {
Instruction *User = II.user_back();
if (User->getOpcode() == Instruction::FPTrunc &&
User->getType()->getScalarType()->isHalfTy()) {
return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
[&](auto &Args, auto &ArgTys) {
// Change return type of image intrinsic.
// Set it to return type of fptrunc.
ArgTys[0] = User->getType();
});
}
}
}
}
// Try to use A16 or G16
if (!ST->hasA16() && !ST->hasG16())
return std::nullopt;
// The address is interpreted as float if the instruction has a sampler, or as
// unsigned int if there is no sampler.
bool HasSampler =
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
bool FloatCoord = false;
// If true, only the derivatives can be converted to 16 bit; the coordinates
// cannot.
bool OnlyDerivatives = false;
for (unsigned OperandIndex = ImageDimIntr->GradientStart;
OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
Value *Coord = II.getOperand(OperandIndex);
// If the values are not derived from 16-bit values, we cannot optimize.
if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
if (OperandIndex < ImageDimIntr->CoordStart ||
ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
return std::nullopt;
}
// All gradients can be converted, so convert only them
OnlyDerivatives = true;
break;
}
assert(OperandIndex == ImageDimIntr->GradientStart ||
FloatCoord == Coord->getType()->isFloatingPointTy());
FloatCoord = Coord->getType()->isFloatingPointTy();
}
if (!OnlyDerivatives && !ST->hasA16())
OnlyDerivatives = true; // Only supports G16
// Check if there is a bias parameter and if it can be converted to f16
if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
assert(HasSampler &&
"Only image instructions with a sampler can have a bias");
if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
OnlyDerivatives = true;
}
if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
ImageDimIntr->CoordStart))
return std::nullopt;
Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
: Type::getInt16Ty(II.getContext());
return modifyIntrinsicCall(
II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
if (!OnlyDerivatives) {
ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
// Change the bias type
if (ImageDimIntr->NumBiasArgs != 0)
ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
}
unsigned EndIndex =
OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
for (unsigned OperandIndex = ImageDimIntr->GradientStart;
OperandIndex < EndIndex; OperandIndex++) {
Args[OperandIndex] =
convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
}
// Convert the bias
if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
}
});
}
bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
const Value *Op0, const Value *Op1,
InstCombiner &IC) const {
// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
// infinity, gives +0.0. If we can prove we don't have one of the special
// cases then we can use a normal multiply instead.
// TODO: Create and use isKnownFiniteNonZero instead of just matching
// constants here.
if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
match(Op1, PatternMatch::m_FiniteNonZero())) {
// One operand is not zero or infinity or NaN.
return true;
}
SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
// Neither operand is infinity or NaN.
return true;
}
return false;
}
/// Match an fpext from half to float, or a constant we can convert.
static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
return FPExtSrc->getType()->isHalfTy();
ConstantFP *CFP;
if (match(Arg, m_ConstantFP(CFP))) {
bool LosesInfo;
APFloat Val(CFP->getValueAPF());
Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
if (LosesInfo)
return false;
FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
return true;
}
return false;
}
// Trim all zero components from the end of the vector \p UseV and return a
// demanded-elements mask covering the components that remain.
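// For example, a store value of <x, y, 0.0, 0.0> yields a demanded mask of
// 0b0011, i.e. only the first two components need to be stored.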
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
Instruction *I) {
auto *VTy = cast<FixedVectorType>(UseV->getType());
unsigned VWidth = VTy->getNumElements();
APInt DemandedElts = APInt::getAllOnes(VWidth);
for (int i = VWidth - 1; i > 0; --i) {
auto *Elt = findScalarElement(UseV, i);
if (!Elt)
break;
if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
break;
} else {
break;
}
DemandedElts.clearBit(i);
}
return DemandedElts;
}
// Trim elements from the end of the vector \p V if they are equal to the
// first element of the vector.
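// For example, <x, x, x, x> yields a demanded mask of 0b0001, while
// <x, y, x, x> yields 0b0011 because the second component differs from the
// first.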
static APInt defaultComponentBroadcast(Value *V) {
auto *VTy = cast<FixedVectorType>(V->getType());
unsigned VWidth = VTy->getNumElements();
APInt DemandedElts = APInt::getAllOnes(VWidth);
Value *FirstComponent = findScalarElement(V, 0);
SmallVector<int> ShuffleMask;
if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
SVI->getShuffleMask(ShuffleMask);
for (int I = VWidth - 1; I > 0; --I) {
if (ShuffleMask.empty()) {
auto *Elt = findScalarElement(V, I);
if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
break;
} else {
// Detect identical elements in the shufflevector result, even though
// findScalarElement cannot tell us what that element is.
if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
break;
}
DemandedElts.clearBit(I);
}
return DemandedElts;
}
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
IntrinsicInst &II,
APInt DemandedElts,
int DMaskIdx = -1,
bool IsLoad = true);
/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt) to
/// llvm.amdgcn.rsq.
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
return (SqrtOp->getType()->isFloatTy() &&
(SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
SqrtOp->getType()->isHalfTy();
}
std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
case Intrinsic::amdgcn_rcp: {
Value *Src = II.getArgOperand(0);
// TODO: Move to ConstantFolding/InstSimplify?
if (isa<UndefValue>(Src)) {
Type *Ty = II.getType();
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
return IC.replaceInstUsesWith(II, QNaN);
}
if (II.isStrictFP())
break;
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
const APFloat &ArgVal = C->getValueAPF();
APFloat Val(ArgVal.getSemantics(), 1);
Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
// This is more precise than the instruction may give.
//
// TODO: The instruction always flushes denormal results (except for f16),
// should this also?
return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
}
FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
if (!FMF.allowContract())
break;
auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
if (!SrcCI)
break;
auto IID = SrcCI->getIntrinsicID();
// llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
//
// llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
// relaxed.
if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
break;
if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
break;
Function *NewDecl = Intrinsic::getDeclaration(
SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
InnerFMF |= FMF;
II.setFastMathFlags(InnerFMF);
II.setCalledFunction(NewDecl);
return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
}
break;
}
case Intrinsic::amdgcn_sqrt:
case Intrinsic::amdgcn_rsq: {
Value *Src = II.getArgOperand(0);
// TODO: Move to ConstantFolding/InstSimplify?
if (isa<UndefValue>(Src)) {
Type *Ty = II.getType();
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
return IC.replaceInstUsesWith(II, QNaN);
}
// f16 amdgcn.sqrt is identical to regular sqrt.
if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
Function *NewDecl = Intrinsic::getDeclaration(
II.getModule(), Intrinsic::sqrt, {II.getType()});
II.setCalledFunction(NewDecl);
return &II;
}
break;
}
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2: {
const bool IsLog = IID == Intrinsic::amdgcn_log;
const bool IsExp = IID == Intrinsic::amdgcn_exp2;
Value *Src = II.getArgOperand(0);
Type *Ty = II.getType();
if (isa<PoisonValue>(Src))
return IC.replaceInstUsesWith(II, Src);
if (IC.getSimplifyQuery().isUndefValue(Src))
return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
if (C->isInfinity()) {
// exp2(+inf) -> +inf
// log2(+inf) -> +inf
if (!C->isNegative())
return IC.replaceInstUsesWith(II, C);
// exp2(-inf) -> 0
if (IsExp && C->isNegative())
return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
}
if (II.isStrictFP())
break;
if (C->isNaN()) {
Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
return IC.replaceInstUsesWith(II, Quieted);
}
// The f32 instruction doesn't handle denormals; the f16 one does.
if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
: ConstantFP::get(Ty, 1.0);
return IC.replaceInstUsesWith(II, FoldedValue);
}
if (IsLog && C->isNegative())
return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
// TODO: Full constant folding matching hardware behavior.
}
break;
}
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp: {
Value *Src = II.getArgOperand(0);
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
int Exp;
APFloat Significand =
frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
if (IID == Intrinsic::amdgcn_frexp_mant) {
return IC.replaceInstUsesWith(
II, ConstantFP::get(II.getContext(), Significand));
}
// Match instruction special case behavior.
if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
Exp = 0;
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
}
if (isa<UndefValue>(Src)) {
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
}
break;
}
case Intrinsic::amdgcn_class: {
Value *Src0 = II.getArgOperand(0);
Value *Src1 = II.getArgOperand(1);
const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
if (CMask) {
II.setCalledOperand(Intrinsic::getDeclaration(
II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
// Clamp any excess bits, as they're illegal for the generic intrinsic.
II.setArgOperand(1, ConstantInt::get(Src1->getType(),
CMask->getZExtValue() & fcAllFlags));
return &II;
}
// Propagate poison.
if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
// llvm.amdgcn.class(_, undef) -> false
if (IC.getSimplifyQuery().isUndefValue(Src1))
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
// llvm.amdgcn.class(undef, mask) -> mask != 0
if (IC.getSimplifyQuery().isUndefValue(Src0)) {
Value *CmpMask = IC.Builder.CreateICmpNE(
Src1, ConstantInt::getNullValue(Src1->getType()));
return IC.replaceInstUsesWith(II, CmpMask);
}
break;
}
case Intrinsic::amdgcn_cvt_pkrtz: {
Value *Src0 = II.getArgOperand(0);
Value *Src1 = II.getArgOperand(1);
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
const fltSemantics &HalfSem =
II.getType()->getScalarType()->getFltSemantics();
bool LosesInfo;
APFloat Val0 = C0->getValueAPF();
APFloat Val1 = C1->getValueAPF();
Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
Constant *Folded =
ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
ConstantFP::get(II.getContext(), Val1)});
return IC.replaceInstUsesWith(II, Folded);
}
}
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
}
break;
}
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16: {
Value *Src0 = II.getArgOperand(0);
Value *Src1 = II.getArgOperand(1);
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
}
break;
}
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe: {
// Decompose simple cases into standard shifts.
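// For example, for an i32 source, ubfe(x, 4, 8) extracts bits [11:4] and can
// be expanded as (x << 20) >> 24 using a logical shift right; sbfe uses an
// arithmetic shift right instead.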
Value *Src = II.getArgOperand(0);
if (isa<UndefValue>(Src)) {
return IC.replaceInstUsesWith(II, Src);
}
unsigned Width;
Type *Ty = II.getType();
unsigned IntSize = Ty->getIntegerBitWidth();
ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
if (CWidth) {
Width = CWidth->getZExtValue();
if ((Width & (IntSize - 1)) == 0) {
return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
}
// Hardware ignores high bits, so remove those.
if (Width >= IntSize) {
return IC.replaceOperand(
II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
}
}
unsigned Offset;
ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
if (COffset) {
Offset = COffset->getZExtValue();
if (Offset >= IntSize) {
return IC.replaceOperand(
II, 1,
ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
}
}
bool Signed = IID == Intrinsic::amdgcn_sbfe;
if (!CWidth || !COffset)
break;
// The case of Width == 0 is handled above, which makes this transformation
// safe. If Width == 0, then the ashr and lshr instructions below would
// produce poison since the shift amount would equal the bit size.
assert(Width != 0);
// TODO: This allows folding to undef when the hardware has specific
// behavior?
if (Offset + Width < IntSize) {
Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
: IC.Builder.CreateLShr(Shl, IntSize - Width);
RightShift->takeName(&II);
return IC.replaceInstUsesWith(II, RightShift);
}
Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
: IC.Builder.CreateLShr(Src, Offset);
RightShift->takeName(&II);
return IC.replaceInstUsesWith(II, RightShift);
}
case Intrinsic::amdgcn_exp:
case Intrinsic::amdgcn_exp_row:
case Intrinsic::amdgcn_exp_compr: {
ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
unsigned EnBits = En->getZExtValue();
if (EnBits == 0xf)
break; // All inputs enabled.
bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
bool Changed = false;
for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
(IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
Value *Src = II.getArgOperand(I + 2);
if (!isa<UndefValue>(Src)) {
IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
Changed = true;
}
}
}
if (Changed) {
return &II;
}
break;
}
case Intrinsic::amdgcn_fmed3: {
// Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
// for the shader.
Value *Src0 = II.getArgOperand(0);
Value *Src1 = II.getArgOperand(1);
Value *Src2 = II.getArgOperand(2);
// Checking for NaN before canonicalization provides better fidelity when
// mapping other operations onto fmed3 since the order of operands is
// unchanged.
Value *V = nullptr;
if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
V = IC.Builder.CreateMinNum(Src1, Src2);
} else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
V = IC.Builder.CreateMinNum(Src0, Src2);
} else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
V = IC.Builder.CreateMaxNum(Src0, Src1);
}
if (V) {
if (auto *CI = dyn_cast<CallInst>(V)) {
CI->copyFastMathFlags(&II);
CI->takeName(&II);
}
return IC.replaceInstUsesWith(II, V);
}
bool Swap = false;
// Canonicalize constants to RHS operands.
//
// fmed3(c0, x, c1) -> fmed3(x, c0, c1)
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
std::swap(Src0, Src1);
Swap = true;
}
if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
std::swap(Src1, Src2);
Swap = true;
}
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
std::swap(Src0, Src1);
Swap = true;
}
if (Swap) {
II.setArgOperand(0, Src0);
II.setArgOperand(1, Src1);
II.setArgOperand(2, Src2);
return &II;
}
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
C2->getValueAPF());
return IC.replaceInstUsesWith(
II, ConstantFP::get(IC.Builder.getContext(), Result));
}
}
}
if (!ST->hasMed3_16())
break;
Value *X, *Y, *Z;
// Repeat floating-point width reduction done for minnum/maxnum.
// fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
matchFPExtFromF16(Src2, Z)) {
Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
{X, Y, Z}, &II, II.getName());
return new FPExtInst(NewCall, II.getType());
}
break;
}
case Intrinsic::amdgcn_icmp:
case Intrinsic::amdgcn_fcmp: {
const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
// Guard against invalid arguments.
int64_t CCVal = CC->getZExtValue();
bool IsInteger = IID == Intrinsic::amdgcn_icmp;
if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
(!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
CCVal > CmpInst::LAST_FCMP_PREDICATE)))
break;
Value *Src0 = II.getArgOperand(0);
Value *Src1 = II.getArgOperand(1);
if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
if (CCmp->isNullValue()) {
return IC.replaceInstUsesWith(
II, IC.Builder.CreateSExt(CCmp, II.getType()));
}
// The result of V_ICMP/V_FCMP assembly instructions (which this
// intrinsic exposes) is one bit per thread, masked with the EXEC
// register (which contains the bitmask of live threads). So a
// comparison that always returns true is the same as a read of the
// EXEC register.
Function *NewF = Intrinsic::getDeclaration(
II.getModule(), Intrinsic::read_register, II.getType());
Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
MDNode *MD = MDNode::get(II.getContext(), MDArgs);
Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
NewCall->addFnAttr(Attribute::Convergent);
NewCall->takeName(&II);
return IC.replaceInstUsesWith(II, NewCall);
}
// Canonicalize constants to RHS.
CmpInst::Predicate SwapPred =
CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
II.setArgOperand(0, Src1);
II.setArgOperand(1, Src0);
II.setArgOperand(
2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
return &II;
}
if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
break;
// Canonicalize compare eq with true value to compare != 0
// llvm.amdgcn.icmp(zext (i1 x), 1, eq)
// -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
// llvm.amdgcn.icmp(sext (i1 x), -1, eq)
// -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
Value *ExtSrc;
if (CCVal == CmpInst::ICMP_EQ &&
((match(Src1, PatternMatch::m_One()) &&
match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
(match(Src1, PatternMatch::m_AllOnes()) &&
match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
ExtSrc->getType()->isIntegerTy(1)) {
IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
IC.replaceOperand(II, 2,
ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
return &II;
}
CmpInst::Predicate SrcPred;
Value *SrcLHS;
Value *SrcRHS;
// Fold compare eq/ne with 0 from a compare result as the predicate to the
// intrinsic. The typical use is a wave vote function in the library, which
// will be fed from a user code condition compared with 0. Fold in the
// redundant compare.
// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
// -> llvm.amdgcn.[if]cmp(a, b, pred)
//
// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
// -> llvm.amdgcn.[if]cmp(a, b, inv pred)
if (match(Src1, PatternMatch::m_Zero()) &&
match(Src0, PatternMatch::m_ZExtOrSExt(
m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
PatternMatch::m_Value(SrcRHS))))) {
if (CCVal == CmpInst::ICMP_EQ)
SrcPred = CmpInst::getInversePredicate(SrcPred);
Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
? Intrinsic::amdgcn_fcmp
: Intrinsic::amdgcn_icmp;
Type *Ty = SrcLHS->getType();
if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
// Promote to next legal integer type.
unsigned Width = CmpType->getBitWidth();
unsigned NewWidth = Width;
// Don't do anything for i1 comparisons.
if (Width == 1)
break;
if (Width <= 16)
NewWidth = 16;
else if (Width <= 32)
NewWidth = 32;
else if (Width <= 64)
NewWidth = 64;
else
break; // Can't handle this.
if (Width != NewWidth) {
IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
if (CmpInst::isSigned(SrcPred)) {
SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
} else {
SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
}
}
} else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
break;
Function *NewF = Intrinsic::getDeclaration(
II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
Value *Args[] = {SrcLHS, SrcRHS,
ConstantInt::get(CC->getType(), SrcPred)};
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
NewCall->takeName(&II);
return IC.replaceInstUsesWith(II, NewCall);
}
break;
}
case Intrinsic::amdgcn_mbcnt_hi: {
// exec_hi is all 0, so this is just a copy.
if (ST->isWave32())
return IC.replaceInstUsesWith(II, II.getArgOperand(1));
break;
}
case Intrinsic::amdgcn_ballot: {
if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
if (Src->isZero()) {
// amdgcn.ballot(i1 0) is zero.
return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
}
}
if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
// %b64 = call i64 ballot.i64(...)
// =>
// %b32 = call i32 ballot.i32(...)
// %b64 = zext i32 %b32 to i64
Value *Call = IC.Builder.CreateZExt(
IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
{IC.Builder.getInt32Ty()},
{II.getArgOperand(0)}),
II.getType());
Call->takeName(&II);
return IC.replaceInstUsesWith(II, Call);
}
break;
}
case Intrinsic::amdgcn_wqm_vote: {
// wqm_vote is identity when the argument is constant.
if (!isa<Constant>(II.getArgOperand(0)))
break;
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
case Intrinsic::amdgcn_kill: {
const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
if (!C || !C->getZExtValue())
break;
// amdgcn.kill(i1 1) is a no-op
return IC.eraseInstFromFunction(II);
}
case Intrinsic::amdgcn_update_dpp: {
Value *Old = II.getArgOperand(0);
auto *BC = cast<ConstantInt>(II.getArgOperand(5));
auto *RM = cast<ConstantInt>(II.getArgOperand(3));
auto *BM = cast<ConstantInt>(II.getArgOperand(4));
if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
break;
// If bound_ctrl = 1 and row_mask = bank_mask = 0xf, we can omit the old value.
return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
}
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlane16_var:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlanex16_var: {
// Discard vdst_in if it's not going to be read.
Value *VDstIn = II.getArgOperand(0);
if (isa<UndefValue>(VDstIn))
break;
// FetchInvalid operand idx.
unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16)
? 4 /* for permlane16 and permlanex16 */
: 3; /* for permlane16_var and permlanex16_var */
// BoundCtrl operand idx.
// For permlane16 and permlanex16 it should be 5.
// For permlane16_var and permlanex16_var it should be 4.
unsigned int BcIdx = FiIdx + 1;
ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
break;
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
}
case Intrinsic::amdgcn_permlane64:
// A constant value is trivially uniform.
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
return IC.replaceInstUsesWith(II, C);
}
break;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
// A constant value is trivially uniform.
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
return IC.replaceInstUsesWith(II, C);
}
// The rest of these may not be safe if exec is not the same between the
// def and the use.
Value *Src = II.getArgOperand(0);
Instruction *SrcInst = dyn_cast<Instruction>(Src);
if (SrcInst && SrcInst->getParent() != II.getParent())
break;
// readfirstlane (readfirstlane x) -> readfirstlane x
// readlane (readfirstlane x), y -> readfirstlane x
if (match(Src,
PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
return IC.replaceInstUsesWith(II, Src);
}
if (IID == Intrinsic::amdgcn_readfirstlane) {
// readfirstlane (readlane x, y) -> readlane x, y
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
return IC.replaceInstUsesWith(II, Src);
}
} else {
// readlane (readlane x, y), y -> readlane x, y
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
PatternMatch::m_Value(),
PatternMatch::m_Specific(II.getArgOperand(1))))) {
return IC.replaceInstUsesWith(II, Src);
}
}
break;
}
case Intrinsic::amdgcn_fmul_legacy: {
Value *Op0 = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
// infinity, gives +0.0.
// TODO: Move to InstSimplify?
if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
match(Op1, PatternMatch::m_AnyZeroFP()))
return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
// If we can prove we don't have one of the special cases then we can use a
// normal fmul instruction instead.
if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
FMul->takeName(&II);
return IC.replaceInstUsesWith(II, FMul);
}
break;
}
case Intrinsic::amdgcn_fma_legacy: {
Value *Op0 = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
Value *Op2 = II.getArgOperand(2);
// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
// infinity, gives +0.0.
// TODO: Move to InstSimplify?
if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
match(Op1, PatternMatch::m_AnyZeroFP())) {
// It's tempting to just return Op2 here, but that would give the wrong
// result if Op2 was -0.0.
auto *Zero = ConstantFP::getZero(II.getType());
auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
FAdd->takeName(&II);
return IC.replaceInstUsesWith(II, FAdd);
}
// If we can prove we don't have one of the special cases then we can use a
// normal fma instead.
if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
II.setCalledOperand(Intrinsic::getDeclaration(
II.getModule(), Intrinsic::fma, II.getType()));
return &II;
}
break;
}
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
if (isa<UndefValue>(II.getArgOperand(0)))
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
if (isa<ConstantPointerNull>(II.getArgOperand(0)))
return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
break;
}
case Intrinsic::amdgcn_buffer_store_format:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_tbuffer_store:
case Intrinsic::amdgcn_image_store_1d:
case Intrinsic::amdgcn_image_store_1darray:
case Intrinsic::amdgcn_image_store_2d:
case Intrinsic::amdgcn_image_store_2darray:
case Intrinsic::amdgcn_image_store_2darraymsaa:
case Intrinsic::amdgcn_image_store_2dmsaa:
case Intrinsic::amdgcn_image_store_3d:
case Intrinsic::amdgcn_image_store_cube:
case Intrinsic::amdgcn_image_store_mip_1d:
case Intrinsic::amdgcn_image_store_mip_1darray:
case Intrinsic::amdgcn_image_store_mip_2d:
case Intrinsic::amdgcn_image_store_mip_2darray:
case Intrinsic::amdgcn_image_store_mip_3d:
case Intrinsic::amdgcn_image_store_mip_cube: {
if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
break;
APInt DemandedElts;
if (ST->hasDefaultComponentBroadcast())
DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
else if (ST->hasDefaultComponentZero())
DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
else
break;
int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
false)) {
return IC.eraseInstFromFunction(II);
}
break;
}
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
}
return std::nullopt;
}
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// For amdgcn image and buffer store intrinsics, the simplification updates the
/// definition of the intrinsic's vector data argument, not the uses of the
/// result as is done for image and buffer loads.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
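/// For example, a raw.buffer.load returning <4 x float> whose last two lanes
/// are never used can be shrunk to a <2 x float> load, with the result
/// shuffled back up to <4 x float> for the remaining users.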
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
IntrinsicInst &II,
APInt DemandedElts,
int DMaskIdx, bool IsLoad) {
auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
: II.getOperand(0)->getType());
unsigned VWidth = IIVTy->getNumElements();
if (VWidth == 1)
return nullptr;
Type *EltTy = IIVTy->getElementType();
IRBuilderBase::InsertPointGuard Guard(IC.Builder);
IC.Builder.SetInsertPoint(&II);
// Assume the arguments are unchanged and later override them, if needed.
SmallVector<Value *, 16> Args(II.args());
if (DMaskIdx < 0) {
// Buffer case.
const unsigned ActiveBits = DemandedElts.getActiveBits();
const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
// Start by assuming the whole prefix of elements up to the last demanded one
// is needed, then clear the bits for any unused components at the front (the
// trailing zero bits of the mask) and update the offset accordingly.
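// For example, if only lanes 2 and 3 of a <4 x float> raw.buffer.load are
// demanded, the load is shrunk to <2 x float> and 8 bytes are added to the
// offset operand.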
DemandedElts = (1 << ActiveBits) - 1;
if (UnusedComponentsAtFront > 0) {
static const unsigned InvalidOffsetIdx = 0xf;
unsigned OffsetIdx;
switch (II.getIntrinsicID()) {
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_s_buffer_load:
// If the resulting type is vec3, there is no point in trimming the load
// with an updated offset, as the vec3 would most likely be widened back to
// vec4 anyway during lowering.
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
OffsetIdx = InvalidOffsetIdx;
else
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
OffsetIdx = 2;
break;
default:
// TODO: handle tbuffer* intrinsics.
OffsetIdx = InvalidOffsetIdx;
break;
}
if (OffsetIdx != InvalidOffsetIdx) {
// Clear demanded bits and update the offset.
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
auto *Offset = Args[OffsetIdx];
unsigned SingleComponentSizeInBits =
IC.getDataLayout().getTypeSizeInBits(EltTy);
unsigned OffsetAdd =
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
}
}
} else {
// Image case.
ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
// dmask 0 has special semantics, do not simplify.
if (DMaskVal == 0)
return nullptr;
// Mask off values that are undefined because the dmask doesn't cover them
DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
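// Rebuild the dmask so it only covers the demanded result lanes. For example
// (illustrative), dmask 0b1011 selects three components; if only the first
// result lane is demanded, the new dmask becomes 0b0001.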
unsigned NewDMaskVal = 0;
unsigned OrigLdStIdx = 0;
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
const unsigned Bit = 1 << SrcIdx;
if (!!(DMaskVal & Bit)) {
if (!!DemandedElts[OrigLdStIdx])
NewDMaskVal |= Bit;
OrigLdStIdx++;
}
}
if (DMaskVal != NewDMaskVal)
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
}
unsigned NewNumElts = DemandedElts.popcount();
if (!NewNumElts)
return PoisonValue::get(IIVTy);
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (DMaskIdx >= 0)
II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
return nullptr;
}
// Validate function argument and return types, extracting overloaded types
// along the way.
SmallVector<Type *, 6> OverloadTys;
if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
return nullptr;
Type *NewTy =
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
OverloadTys[0] = NewTy;
if (!IsLoad) {
SmallVector<int, 8> EltMask;
for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
if (DemandedElts[OrigStoreIdx])
EltMask.push_back(OrigStoreIdx);
if (NewNumElts == 1)
Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
else
Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
}
Function *NewIntrin = Intrinsic::getDeclaration(
II.getModule(), II.getIntrinsicID(), OverloadTys);
CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
NewCall->takeName(&II);
NewCall->copyMetadata(II);
if (IsLoad) {
if (NewNumElts == 1) {
return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
DemandedElts.countr_zero());
}
SmallVector<int, 8> EltMask;
unsigned NewLoadIdx = 0;
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
if (!!DemandedElts[OrigLoadIdx])
EltMask.push_back(NewLoadIdx++);
else
EltMask.push_back(NewNumElts);
}
auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
return Shuffle;
}
return NewCall;
}
std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
APInt &UndefElts2, APInt &UndefElts3,
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
case Intrinsic::amdgcn_s_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
}
break;
}
}
return std::nullopt;
}