| //===- MVEGatherScatterLowering.cpp - Gather/Scatter lowering -------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// This pass custom lowers llvm.masked.gather and llvm.masked.scatter |
| /// intrinsics to ARM MVE gather and scatter intrinsics, optimising the code |
| /// to produce a better final result as we go. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "ARM.h" |
| #include "ARMBaseInstrInfo.h" |
| #include "ARMSubtarget.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/CodeGen/TargetLowering.h" |
| #include "llvm/CodeGen/TargetPassConfig.h" |
| #include "llvm/CodeGen/TargetSubtargetInfo.h" |
| #include "llvm/InitializePasses.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/Constant.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/InstrTypes.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/IntrinsicsARM.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Transforms/Utils/Local.h" |
| #include <algorithm> |
| #include <cassert> |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "arm-mve-gather-scatter-lowering" |
| |
| cl::opt<bool> EnableMaskedGatherScatters( |
| "enable-arm-maskedgatscat", cl::Hidden, cl::init(true), |
| cl::desc("Enable the generation of masked gathers and scatters")); |
| |
| namespace { |
| |
| class MVEGatherScatterLowering : public FunctionPass { |
| public: |
| static char ID; // Pass identification, replacement for typeid |
| |
| explicit MVEGatherScatterLowering() : FunctionPass(ID) { |
| initializeMVEGatherScatterLoweringPass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnFunction(Function &F) override; |
| |
| StringRef getPassName() const override { |
| return "MVE gather/scatter lowering"; |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.setPreservesCFG(); |
| AU.addRequired<TargetPassConfig>(); |
| AU.addRequired<LoopInfoWrapperPass>(); |
| FunctionPass::getAnalysisUsage(AU); |
| } |
| |
| private: |
| LoopInfo *LI = nullptr; |
| const DataLayout *DL = nullptr; |
| |
| // Check whether this is a valid gather/scatter type and alignment |
| bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, |
| Align Alignment); |
| // Check whether Ptr is hidden behind a bitcast and look through it |
| void lookThroughBitcast(Value *&Ptr); |
| // Decompose a pointer into a Base and Offsets, potentially using a GEP to |
| // return a scalar base and vector offsets, or else falling back to using a |
| // base of 0 and offset of Ptr where possible. |
| Value *decomposePtr(Value *Ptr, Value *&Offsets, int &Scale, |
| FixedVectorType *Ty, Type *MemoryTy, |
| IRBuilder<> &Builder); |
| // Check for a getelementptr and deduce base and offsets from it, on success |
| // returning the base directly and the offsets indirectly using the Offsets |
| // argument |
| Value *decomposeGEP(Value *&Offsets, FixedVectorType *Ty, |
| GetElementPtrInst *GEP, IRBuilder<> &Builder); |
| // Compute the scale of this gather/scatter instruction |
| int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize); |
| // If the value is a constant, or derived from constants via additions, |
| // multiplications, shifts and add-like ors, return its numeric value |
| Optional<int64_t> getIfConst(const Value *V); |
| // If Inst is an add instruction, check whether one summand is a |
| // constant. If so, scale this constant and return it together with |
| // the other summand. |
| std::pair<Value *, int64_t> getVarAndConst(Value *Inst, int TypeScale); |
| |
| Instruction *lowerGather(IntrinsicInst *I); |
| // Create a gather from a base + vector of offsets |
| Instruction *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr, |
| Instruction *&Root, |
| IRBuilder<> &Builder); |
| // Create a gather from a vector of pointers |
| Instruction *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr, |
| IRBuilder<> &Builder, |
| int64_t Increment = 0); |
| // Create an incrementing gather from a vector of pointers |
| Instruction *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr, |
| IRBuilder<> &Builder, |
| int64_t Increment = 0); |
| |
| Instruction *lowerScatter(IntrinsicInst *I); |
| // Create a scatter to a base + vector of offsets |
| Instruction *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets, |
| IRBuilder<> &Builder); |
| // Create a scatter to a vector of pointers |
| Instruction *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr, |
| IRBuilder<> &Builder, |
| int64_t Increment = 0); |
| // Create an incrementing scatter from a vector of pointers |
| Instruction *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr, |
| IRBuilder<> &Builder, |
| int64_t Increment = 0); |
| |
| // QI gathers and scatters can increment their offsets on their own if |
| // the increment is a constant value (immediate) |
| Instruction *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *Ptr, |
| IRBuilder<> &Builder); |
| // QI gathers/scatters can increment their offsets on their own if the |
| // increment is a constant value (immediate) - this creates a writeback QI |
| // gather/scatter |
| Instruction *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr, |
| Value *Ptr, unsigned TypeScale, |
| IRBuilder<> &Builder); |
| |
| // Optimise the base and offsets of the given address |
| bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI); |
| // Try to fold consecutive geps together into one |
| Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder); |
| // Check whether these offsets could be moved out of the loop they're in |
| bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI); |
| // Pushes the given add out of the loop |
| void pushOutAdd(PHINode *&Phi, Value *OffsSecondOperand, unsigned StartIndex); |
| // Pushes the given mul or shl out of the loop |
| void pushOutMulShl(unsigned Opc, PHINode *&Phi, Value *IncrementPerRound, |
| Value *OffsSecondOperand, unsigned LoopIncrement, |
| IRBuilder<> &Builder); |
| }; |
| |
| } // end anonymous namespace |
| |
| char MVEGatherScatterLowering::ID = 0; |
| |
| INITIALIZE_PASS(MVEGatherScatterLowering, DEBUG_TYPE, |
| "MVE gather/scattering lowering pass", false, false) |
| |
| Pass *llvm::createMVEGatherScatterLoweringPass() { |
| return new MVEGatherScatterLowering(); |
| } |
| |
| bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements, |
| unsigned ElemSize, |
| Align Alignment) { |
| if (((NumElements == 4 && |
| (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) || |
| (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) || |
| (NumElements == 16 && ElemSize == 8)) && |
| Alignment >= ElemSize / 8) |
| return true; |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: instruction does not have " |
| << "valid alignment or vector type \n"); |
| return false; |
| } |
| |
| static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) { |
| // Offsets that are not of type <N x i32> are sign extended by the |
| // getelementptr instruction, and MVE gathers/scatters treat the offset as |
| // unsigned. Thus, if the element size is smaller than 32, we can only allow |
| // positive offsets - i.e., the offsets are not allowed to be variables we |
| // can't look into. |
| // Additionally, <N x i32> offsets have to either originate from a zext of a |
| // vector with element types smaller than or equal to the type of the gather |
| // we're looking at, or consist of constants that we can check are small |
| // enough to fit into the gather type. |
| // Thus we check that 0 <= value < 2^TargetElemSize. |
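| // e.g. for a gather of 8 x 16-bit elements, TargetElemSize is 16, so each |
| // constant offset must lie in [0, 2^16) (illustrative of the check below). |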
| unsigned TargetElemSize = 128 / TargetElemCount; |
| unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType()) |
| ->getElementType() |
| ->getScalarSizeInBits(); |
| if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) { |
| Constant *ConstOff = dyn_cast<Constant>(Offsets); |
| if (!ConstOff) |
| return false; |
| int64_t TargetElemMaxSize = (1ULL << TargetElemSize); |
| auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) { |
| ConstantInt *OConst = dyn_cast<ConstantInt>(OffsetElem); |
| if (!OConst) |
| return false; |
| int64_t SExtValue = OConst->getSExtValue(); |
| if (SExtValue >= TargetElemMaxSize || SExtValue < 0) |
| return false; |
| return true; |
| }; |
| if (isa<FixedVectorType>(ConstOff->getType())) { |
| for (unsigned i = 0; i < TargetElemCount; i++) { |
| if (!CheckValueSize(ConstOff->getAggregateElement(i))) |
| return false; |
| } |
| } else { |
| if (!CheckValueSize(ConstOff)) |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| Value *MVEGatherScatterLowering::decomposePtr(Value *Ptr, Value *&Offsets, |
| int &Scale, FixedVectorType *Ty, |
| Type *MemoryTy, |
| IRBuilder<> &Builder) { |
| if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { |
| if (Value *V = decomposeGEP(Offsets, Ty, GEP, Builder)) { |
| Scale = |
| computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(), |
| MemoryTy->getScalarSizeInBits()); |
| return Scale == -1 ? nullptr : V; |
| } |
| } |
| |
| // If we couldn't use the GEP (or it doesn't exist), attempt to use a |
| // BasePtr of 0 with Ptr as the Offsets, so long as there are only 4 |
| // elements. |
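| // e.g. (illustrative) a <4 x i16*> vector of pointers can still be handled |
| // by reinterpreting the pointer bits as unsigned 32-bit offsets from a base |
| // address of 0. |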
| FixedVectorType *PtrTy = cast<FixedVectorType>(Ptr->getType()); |
| if (PtrTy->getNumElements() != 4 || MemoryTy->getScalarSizeInBits() == 32) |
| return nullptr; |
| Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0); |
| Value *BasePtr = Builder.CreateIntToPtr(Zero, Builder.getInt8PtrTy()); |
| Offsets = Builder.CreatePtrToInt( |
| Ptr, FixedVectorType::get(Builder.getInt32Ty(), 4)); |
| Scale = 0; |
| return BasePtr; |
| } |
| |
| Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets, |
| FixedVectorType *Ty, |
| GetElementPtrInst *GEP, |
| IRBuilder<> &Builder) { |
| if (!GEP) { |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: no getelementpointer " |
| << "found\n"); |
| return nullptr; |
| } |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found." |
| << " Looking at intrinsic for base + vector of offsets\n"); |
| Value *GEPPtr = GEP->getPointerOperand(); |
| Offsets = GEP->getOperand(1); |
| if (GEPPtr->getType()->isVectorTy() || |
| !isa<FixedVectorType>(Offsets->getType())) |
| return nullptr; |
| |
| if (GEP->getNumOperands() != 2) { |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many" |
| << " operands. Expanding.\n"); |
| return nullptr; |
| } |
| unsigned OffsetsElemCount = |
| cast<FixedVectorType>(Offsets->getType())->getNumElements(); |
| // Paranoid check whether the number of parallel lanes is the same |
| assert(Ty->getNumElements() == OffsetsElemCount); |
| |
| ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets); |
| if (ZextOffs) |
| Offsets = ZextOffs->getOperand(0); |
| FixedVectorType *OffsetType = cast<FixedVectorType>(Offsets->getType()); |
| |
| // If the offsets are already being zext-ed to <N x i32>, that relieves us of |
| // having to make sure that they won't overflow. |
| if (!ZextOffs || cast<FixedVectorType>(ZextOffs->getDestTy()) |
| ->getElementType() |
| ->getScalarSizeInBits() != 32) |
| if (!checkOffsetSize(Offsets, OffsetsElemCount)) |
| return nullptr; |
| |
| // The offset sizes have been checked; if any truncating or zext-ing is |
| // required to fix them, do that now |
| if (Ty != Offsets->getType()) { |
| if ((Ty->getElementType()->getScalarSizeInBits() < |
| OffsetType->getElementType()->getScalarSizeInBits())) { |
| Offsets = Builder.CreateTrunc(Offsets, Ty); |
| } else { |
| Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty)); |
| } |
| } |
| // If none of the checks failed, return the gep's base pointer |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n"); |
| return GEPPtr; |
| } |
| |
| void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) { |
| // Look through bitcast instruction if #elements is the same |
| if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) { |
| auto *BCTy = cast<FixedVectorType>(BitCast->getType()); |
| auto *BCSrcTy = cast<FixedVectorType>(BitCast->getOperand(0)->getType()); |
| if (BCTy->getNumElements() == BCSrcTy->getNumElements()) { |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: looking through " |
| << "bitcast\n"); |
| Ptr = BitCast->getOperand(0); |
| } |
| } |
| } |
| |
| int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize, |
| unsigned MemoryElemSize) { |
| // The return value is the shift amount the hardware applies to the offsets: |
| // a 32-bit load/store has its offsets scaled by 4 (shift of 2), a 16-bit one |
| // by 2 (shift of 1), and an 8-bit one by 1 (shift of 0); -1 means no valid |
| // scale exists |
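| // e.g. a gather of 32-bit elements through a getelementptr of i32 returns |
| // 2, since the hardware shifts each offset left by 2, i.e. scales it by 4 |
| // (illustrative). |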
| if (GEPElemSize == 32 && MemoryElemSize == 32) |
| return 2; |
| else if (GEPElemSize == 16 && MemoryElemSize == 16) |
| return 1; |
| else if (GEPElemSize == 8) |
| return 0; |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: incorrect scale. Can't " |
| << "create intrinsic\n"); |
| return -1; |
| } |
| |
| Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) { |
| const Constant *C = dyn_cast<Constant>(V); |
| if (C && C->getSplatValue()) |
| return Optional<int64_t>{C->getUniqueInteger().getSExtValue()}; |
| if (!isa<Instruction>(V)) |
| return Optional<int64_t>{}; |
| |
| const Instruction *I = cast<Instruction>(V); |
| if (I->getOpcode() == Instruction::Add || I->getOpcode() == Instruction::Or || |
| I->getOpcode() == Instruction::Mul || |
| I->getOpcode() == Instruction::Shl) { |
| Optional<int64_t> Op0 = getIfConst(I->getOperand(0)); |
| Optional<int64_t> Op1 = getIfConst(I->getOperand(1)); |
| if (!Op0 || !Op1) |
| return Optional<int64_t>{}; |
| if (I->getOpcode() == Instruction::Add) |
| return Optional<int64_t>{Op0.getValue() + Op1.getValue()}; |
| if (I->getOpcode() == Instruction::Mul) |
| return Optional<int64_t>{Op0.getValue() * Op1.getValue()}; |
| if (I->getOpcode() == Instruction::Shl) |
| return Optional<int64_t>{Op0.getValue() << Op1.getValue()}; |
| if (I->getOpcode() == Instruction::Or) |
| return Optional<int64_t>{Op0.getValue() | Op1.getValue()}; |
| } |
| return Optional<int64_t>{}; |
| } |
| |
| // Return true if I is an Or instruction that is equivalent to an add, due to |
| // the operands having no common bits set. |
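| // e.g. `or %x, 1` is equivalent to `add %x, 1` when %x is known to have |
| // its low bit clear (illustrative). |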
| static bool isAddLikeOr(Instruction *I, const DataLayout &DL) { |
| return I->getOpcode() == Instruction::Or && |
| haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL); |
| } |
| |
| std::pair<Value *, int64_t> |
| MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) { |
| std::pair<Value *, int64_t> ReturnFalse = |
| std::pair<Value *, int64_t>(nullptr, 0); |
| // At this point, the instruction we're looking at must be an add or an |
| // add-like-or. |
| Instruction *Add = dyn_cast<Instruction>(Inst); |
| if (Add == nullptr || |
| (Add->getOpcode() != Instruction::Add && !isAddLikeOr(Add, *DL))) |
| return ReturnFalse; |
| |
| Value *Summand; |
| Optional<int64_t> Const; |
| // Find out which operand the value that is increased is |
| if ((Const = getIfConst(Add->getOperand(0)))) |
| Summand = Add->getOperand(1); |
| else if ((Const = getIfConst(Add->getOperand(1)))) |
| Summand = Add->getOperand(0); |
| else |
| return ReturnFalse; |
| |
| // Check that the constant is small enough for an incrementing gather |
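| // e.g. with TypeScale 2 (32-bit elements), a constant summand of 100 |
| // becomes an immediate of 400 and is accepted; 200 would give 800 and be |
| // rejected (illustrative values). |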
| int64_t Immediate = Const.getValue() << TypeScale; |
| if (Immediate > 512 || Immediate < -512 || Immediate % 4 != 0) |
| return ReturnFalse; |
| |
| return std::pair<Value *, int64_t>(Summand, Immediate); |
| } |
| |
| Instruction *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { |
| using namespace PatternMatch; |
| LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n" |
| << *I << "\n"); |
| |
| // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) |
| // Attempt to turn the masked gather in I into an MVE intrinsic, potentially |
| // optimising the addressing modes as we go. |
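| // e.g. (illustrative IR) an all-true-mask v4i32 gather such as |
| //   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %p, |
| //          i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, |
| //          <4 x i32> undef) |
| // can be lowered to a single @llvm.arm.mve.vldr.gather.base-style call. |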
| auto *Ty = cast<FixedVectorType>(I->getType()); |
| Value *Ptr = I->getArgOperand(0); |
| Align Alignment = cast<ConstantInt>(I->getArgOperand(1))->getAlignValue(); |
| Value *Mask = I->getArgOperand(2); |
| Value *PassThru = I->getArgOperand(3); |
| |
| if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(), |
| Alignment)) |
| return nullptr; |
| lookThroughBitcast(Ptr); |
| assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); |
| |
| IRBuilder<> Builder(I->getContext()); |
| Builder.SetInsertPoint(I); |
| Builder.SetCurrentDebugLocation(I->getDebugLoc()); |
| |
| Instruction *Root = I; |
| |
| Instruction *Load = tryCreateIncrementingGatScat(I, Ptr, Builder); |
| if (!Load) |
| Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder); |
| if (!Load) |
| Load = tryCreateMaskedGatherBase(I, Ptr, Builder); |
| if (!Load) |
| return nullptr; |
| |
| if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) { |
| LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - " |
| << "creating select\n"); |
| Load = SelectInst::Create(Mask, Load, PassThru); |
| Builder.Insert(Load); |
| } |
| |
| Root->replaceAllUsesWith(Load); |
| Root->eraseFromParent(); |
| if (Root != I) |
| // If this was an extending gather, we need to get rid of the sext/zext |
| // as well as the gather itself |
| I->eraseFromParent(); |
| |
| LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n" |
| << *Load << "\n"); |
| return Load; |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherBase( |
| IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) { |
| using namespace PatternMatch; |
| auto *Ty = cast<FixedVectorType>(I->getType()); |
| LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n"); |
| if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) |
| // Can't build an intrinsic for this |
| return nullptr; |
| Value *Mask = I->getArgOperand(2); |
| if (match(Mask, m_One())) |
| return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, |
| {Ty, Ptr->getType()}, |
| {Ptr, Builder.getInt32(Increment)}); |
| else |
| return Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vldr_gather_base_predicated, |
| {Ty, Ptr->getType(), Mask->getType()}, |
| {Ptr, Builder.getInt32(Increment), Mask}); |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB( |
| IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) { |
| using namespace PatternMatch; |
| auto *Ty = cast<FixedVectorType>(I->getType()); |
| LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers with " |
| << "writeback\n"); |
| if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) |
| // Can't build an intrinsic for this |
| return nullptr; |
| Value *Mask = I->getArgOperand(2); |
| if (match(Mask, m_One())) |
| return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb, |
| {Ty, Ptr->getType()}, |
| {Ptr, Builder.getInt32(Increment)}); |
| else |
| return Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vldr_gather_base_wb_predicated, |
| {Ty, Ptr->getType(), Mask->getType()}, |
| {Ptr, Builder.getInt32(Increment), Mask}); |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( |
| IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> &Builder) { |
| using namespace PatternMatch; |
| |
| Type *MemoryTy = I->getType(); |
| Type *ResultTy = MemoryTy; |
| |
| unsigned Unsigned = 1; |
| // The size of the gather was already checked in isLegalTypeAndAlignment; |
| // if it was not a full vector width an appropriate extend should follow. |
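| // e.g. (illustrative IR) the pair |
| //   %g = call <4 x i16> @llvm.masked.gather...(...) |
| //   %e = sext <4 x i16> %g to <4 x i32> |
| // can be folded into one signed extending gather that returns <4 x i32>. |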
| auto *Extend = Root; |
| bool TruncResult = false; |
| if (MemoryTy->getPrimitiveSizeInBits() < 128) { |
| if (I->hasOneUse()) { |
| // If the gather has a single extend of the correct type, use an extending |
| // gather and replace the ext; in that case the correct root to replace |
| // is not the CallInst itself, but the instruction which extends it. |
| Instruction *User = cast<Instruction>(*I->users().begin()); |
| if (isa<SExtInst>(User) && |
| User->getType()->getPrimitiveSizeInBits() == 128) { |
| LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: " |
| << *User << "\n"); |
| Extend = User; |
| ResultTy = User->getType(); |
| Unsigned = 0; |
| } else if (isa<ZExtInst>(User) && |
| User->getType()->getPrimitiveSizeInBits() == 128) { |
| LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: " |
| << *User << "\n"); |
| Extend = User; |
| ResultTy = User->getType(); |
| } |
| } |
| |
| // If an extend hasn't been found and the type is an integer, create an |
| // extending gather and truncate back to the original type. |
| if (ResultTy->getPrimitiveSizeInBits() < 128 && |
| ResultTy->isIntOrIntVectorTy()) { |
| ResultTy = ResultTy->getWithNewBitWidth( |
| 128 / cast<FixedVectorType>(ResultTy)->getNumElements()); |
| TruncResult = true; |
| LLVM_DEBUG(dbgs() << "masked gathers: Small input type, truncing to: " |
| << *ResultTy << "\n"); |
| } |
| |
| // The final size of the gather must be a full vector width |
| if (ResultTy->getPrimitiveSizeInBits() != 128) { |
| LLVM_DEBUG(dbgs() << "masked gathers: Extend needed but not provided " |
| "from the correct type. Expanding\n"); |
| return nullptr; |
| } |
| } |
| |
| Value *Offsets; |
| int Scale; |
| Value *BasePtr = decomposePtr( |
| Ptr, Offsets, Scale, cast<FixedVectorType>(ResultTy), MemoryTy, Builder); |
| if (!BasePtr) |
| return nullptr; |
| |
| Root = Extend; |
| Value *Mask = I->getArgOperand(2); |
| Instruction *Load = nullptr; |
| if (!match(Mask, m_One())) |
| Load = Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vldr_gather_offset_predicated, |
| {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()}, |
| {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()), |
| Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask}); |
| else |
| Load = Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vldr_gather_offset, |
| {ResultTy, BasePtr->getType(), Offsets->getType()}, |
| {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()), |
| Builder.getInt32(Scale), Builder.getInt32(Unsigned)}); |
| |
| if (TruncResult) { |
| Load = TruncInst::Create(Instruction::Trunc, Load, MemoryTy); |
| Builder.Insert(Load); |
| } |
| return Load; |
| } |
| |
| Instruction *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) { |
| using namespace PatternMatch; |
| LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n" |
| << *I << "\n"); |
| |
| // @llvm.masked.scatter.*(data, ptrs, alignment, mask) |
| // Attempt to turn the masked scatter in I into an MVE intrinsic, |
| // potentially optimising the addressing modes as we go. |
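| // e.g. (illustrative) an all-true-mask v4i32 scatter to a vector of |
| // pointers maps onto @llvm.arm.mve.vstr.scatter.base; masked forms map |
| // onto the corresponding .predicated variants. |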
| Value *Input = I->getArgOperand(0); |
| Value *Ptr = I->getArgOperand(1); |
| Align Alignment = cast<ConstantInt>(I->getArgOperand(2))->getAlignValue(); |
| auto *Ty = cast<FixedVectorType>(Input->getType()); |
| |
| if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(), |
| Alignment)) |
| return nullptr; |
| |
| lookThroughBitcast(Ptr); |
| assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); |
| |
| IRBuilder<> Builder(I->getContext()); |
| Builder.SetInsertPoint(I); |
| Builder.SetCurrentDebugLocation(I->getDebugLoc()); |
| |
| Instruction *Store = tryCreateIncrementingGatScat(I, Ptr, Builder); |
| if (!Store) |
| Store = tryCreateMaskedScatterOffset(I, Ptr, Builder); |
| if (!Store) |
| Store = tryCreateMaskedScatterBase(I, Ptr, Builder); |
| if (!Store) |
| return nullptr; |
| |
| LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n" |
| << *Store << "\n"); |
| I->eraseFromParent(); |
| return Store; |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterBase( |
| IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) { |
| using namespace PatternMatch; |
| Value *Input = I->getArgOperand(0); |
| auto *Ty = cast<FixedVectorType>(Input->getType()); |
| // Only QR variants allow truncating |
| if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) { |
| // Can't build an intrinsic for this |
| return nullptr; |
| } |
| Value *Mask = I->getArgOperand(3); |
| // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask) |
| LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n"); |
| if (match(Mask, m_One())) |
| return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base, |
| {Ptr->getType(), Input->getType()}, |
| {Ptr, Builder.getInt32(Increment), Input}); |
| else |
| return Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vstr_scatter_base_predicated, |
| {Ptr->getType(), Input->getType(), Mask->getType()}, |
| {Ptr, Builder.getInt32(Increment), Input, Mask}); |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB( |
| IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) { |
| using namespace PatternMatch; |
| Value *Input = I->getArgOperand(0); |
| auto *Ty = cast<FixedVectorType>(Input->getType()); |
| LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers " |
| << "with writeback\n"); |
| if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) |
| // Can't build an intrinsic for this |
| return nullptr; |
| Value *Mask = I->getArgOperand(3); |
| if (match(Mask, m_One())) |
| return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base_wb, |
| {Ptr->getType(), Input->getType()}, |
| {Ptr, Builder.getInt32(Increment), Input}); |
| else |
| return Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vstr_scatter_base_wb_predicated, |
| {Ptr->getType(), Input->getType(), Mask->getType()}, |
| {Ptr, Builder.getInt32(Increment), Input, Mask}); |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterOffset( |
| IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) { |
| using namespace PatternMatch; |
| Value *Input = I->getArgOperand(0); |
| Value *Mask = I->getArgOperand(3); |
| Type *InputTy = Input->getType(); |
| Type *MemoryTy = InputTy; |
| |
| LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing" |
| << " to base + vector of offsets\n"); |
| // If the input has been truncated, try to integrate that trunc into the |
| // scatter instruction (we don't care about alignment here) |
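| // e.g. (illustrative IR) if the stored data is |
| //   %t = trunc <4 x i32> %x to <4 x i16> |
| // we can scatter %x directly with a 16-bit memory type, i.e. perform a |
| // truncating store. |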
| if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) { |
| Value *PreTrunc = Trunc->getOperand(0); |
| Type *PreTruncTy = PreTrunc->getType(); |
| if (PreTruncTy->getPrimitiveSizeInBits() == 128) { |
| Input = PreTrunc; |
| InputTy = PreTruncTy; |
| } |
| } |
| bool ExtendInput = false; |
| if (InputTy->getPrimitiveSizeInBits() < 128 && |
| InputTy->isIntOrIntVectorTy()) { |
| // If we can't find a trunc to incorporate into the instruction, create an |
| // implicit one with a zext, so that we can still create a scatter. We know |
| // that the input type is 4x/8x/16x and of type i8/i16/i32, so any type |
| // smaller than 128 bits will divide evenly into a 128-bit vector. |
| InputTy = InputTy->getWithNewBitWidth( |
| 128 / cast<FixedVectorType>(InputTy)->getNumElements()); |
| ExtendInput = true; |
| LLVM_DEBUG(dbgs() << "masked scatters: Small input type, will extend:\n" |
| << *Input << "\n"); |
| } |
| if (InputTy->getPrimitiveSizeInBits() != 128) { |
| LLVM_DEBUG(dbgs() << "masked scatters: cannot create scatters for " |
| "non-standard input types. Expanding.\n"); |
| return nullptr; |
| } |
| |
| Value *Offsets; |
| int Scale; |
| Value *BasePtr = decomposePtr( |
| Ptr, Offsets, Scale, cast<FixedVectorType>(InputTy), MemoryTy, Builder); |
| if (!BasePtr) |
| return nullptr; |
| |
| if (ExtendInput) |
| Input = Builder.CreateZExt(Input, InputTy); |
| if (!match(Mask, m_One())) |
| return Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vstr_scatter_offset_predicated, |
| {BasePtr->getType(), Offsets->getType(), Input->getType(), |
| Mask->getType()}, |
| {BasePtr, Offsets, Input, |
| Builder.getInt32(MemoryTy->getScalarSizeInBits()), |
| Builder.getInt32(Scale), Mask}); |
| else |
| return Builder.CreateIntrinsic( |
| Intrinsic::arm_mve_vstr_scatter_offset, |
| {BasePtr->getType(), Offsets->getType(), Input->getType()}, |
| {BasePtr, Offsets, Input, |
| Builder.getInt32(MemoryTy->getScalarSizeInBits()), |
| Builder.getInt32(Scale)}); |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat( |
| IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) { |
| FixedVectorType *Ty; |
| if (I->getIntrinsicID() == Intrinsic::masked_gather) |
| Ty = cast<FixedVectorType>(I->getType()); |
| else |
| Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType()); |
| |
| // Incrementing gathers only exist for v4i32 |
| if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) |
| return nullptr; |
| // Incrementing gathers are not beneficial outside of a loop |
| Loop *L = LI->getLoopFor(I->getParent()); |
| if (L == nullptr) |
| return nullptr; |
| |
| // Decompose the GEP into Base and Offsets |
| GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); |
| Value *Offsets; |
| Value *BasePtr = decomposeGEP(Offsets, Ty, GEP, Builder); |
| if (!BasePtr) |
| return nullptr; |
| |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing " |
| "wb gather/scatter\n"); |
| |
| // The gep was in charge of making sure the offsets are scaled correctly |
| // - calculate that factor so it can be applied by hand |
| int TypeScale = |
| computeScale(DL->getTypeSizeInBits(GEP->getOperand(0)->getType()), |
| DL->getTypeSizeInBits(GEP->getType()) / |
| cast<FixedVectorType>(GEP->getType())->getNumElements()); |
| if (TypeScale == -1) |
| return nullptr; |
| |
| if (GEP->hasOneUse()) { |
| // Only in this case do we want to build a writeback gather, because the |
| // writeback changes the phi, which would affect other users of the gep |
| // (they would still be using the phi in the old way) |
| if (auto *Load = tryCreateIncrementingWBGatScat(I, BasePtr, Offsets, |
| TypeScale, Builder)) |
| return Load; |
| } |
| |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing " |
| "non-wb gather/scatter\n"); |
| |
| std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale); |
| if (Add.first == nullptr) |
| return nullptr; |
| Value *OffsetsIncoming = Add.first; |
| int64_t Immediate = Add.second; |
| |
| // Make sure the offsets are scaled correctly |
| Instruction *ScaledOffsets = BinaryOperator::Create( |
| Instruction::Shl, OffsetsIncoming, |
| Builder.CreateVectorSplat(Ty->getNumElements(), Builder.getInt32(TypeScale)), |
| "ScaledIndex", I); |
| // Add the base to the offsets |
| OffsetsIncoming = BinaryOperator::Create( |
| Instruction::Add, ScaledOffsets, |
| Builder.CreateVectorSplat( |
| Ty->getNumElements(), |
| Builder.CreatePtrToInt( |
| BasePtr, |
| cast<VectorType>(ScaledOffsets->getType())->getElementType())), |
| "StartIndex", I); |
| |
| if (I->getIntrinsicID() == Intrinsic::masked_gather) |
| return tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate); |
| else |
| return tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate); |
| } |
| |
| Instruction *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat( |
| IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale, |
| IRBuilder<> &Builder) { |
| // Check whether this gather's offset is incremented by a constant - if so, |
| // and the load is of the right type, we can merge this into a QI gather |
| Loop *L = LI->getLoopFor(I->getParent()); |
| // Offsets that are worth merging into this instruction will be incremented |
| // by a constant, thus we're looking for an add of a phi and a constant |
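| // The shape we are looking for is roughly (illustrative IR sketch): |
| //   vector.body: |
| //     %offs = phi <4 x i32> [ %start, %ph ], [ %offs.next, %vector.body ] |
| //     ... gather/scatter through gep(%base, %offs) ... |
| //     %offs.next = add <4 x i32> %offs, <splat of constant> |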
| PHINode *Phi = dyn_cast<PHINode>(Offsets); |
| if (Phi == nullptr || Phi->getNumIncomingValues() != 2 || |
| Phi->getParent() != L->getHeader() || Phi->getNumUses() != 2) |
| // No phi means no IV to write back to; if there is a phi, we expect it |
| // to have exactly two incoming values; the only phis we are interested in |
| // will be loop IV's and have exactly two uses, one in their increment and |
| // one in the gather's gep |
| return nullptr; |
| |
| unsigned IncrementIndex = |
| Phi->getIncomingBlock(0) == L->getLoopLatch() ? 0 : 1; |
| // Look through the phi to the phi increment |
| Offsets = Phi->getIncomingValue(IncrementIndex); |
| |
| std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale); |
| if (Add.first == nullptr) |
| return nullptr; |
| Value *OffsetsIncoming = Add.first; |
| int64_t Immediate = Add.second; |
| if (OffsetsIncoming != Phi) |
| // Then the increment we are looking at is not an increment of the |
| // induction variable, and we don't want to do a writeback |
| return nullptr; |
| |
| Builder.SetInsertPoint(&Phi->getIncomingBlock(1 - IncrementIndex)->back()); |
| unsigned NumElems = |
| cast<FixedVectorType>(OffsetsIncoming->getType())->getNumElements(); |
| |
| // Make sure the offsets are scaled correctly |
| Instruction *ScaledOffsets = BinaryOperator::Create( |
| Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex), |
| Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)), |
| "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back()); |
| // Add the base to the offsets |
| OffsetsIncoming = BinaryOperator::Create( |
| Instruction::Add, ScaledOffsets, |
| Builder.CreateVectorSplat( |
| NumElems, |
| Builder.CreatePtrToInt( |
| BasePtr, |
| cast<VectorType>(ScaledOffsets->getType())->getElementType())), |
| "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back()); |
| // The gather is pre-incrementing |
| OffsetsIncoming = BinaryOperator::Create( |
| Instruction::Sub, OffsetsIncoming, |
| Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)), |
| "PreIncrementStartIndex", |
| &Phi->getIncomingBlock(1 - IncrementIndex)->back()); |
| Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming); |
| |
| Builder.SetInsertPoint(I); |
| |
| Instruction *EndResult; |
| Instruction *NewInduction; |
| if (I->getIntrinsicID() == Intrinsic::masked_gather) { |
| // Build the incrementing gather |
| Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate); |
| // One value to be handed to whoever uses the gather, one is the loop |
| // increment |
| EndResult = ExtractValueInst::Create(Load, 0, "Gather"); |
| NewInduction = ExtractValueInst::Create(Load, 1, "GatherIncrement"); |
| Builder.Insert(EndResult); |
| Builder.Insert(NewInduction); |
| } else { |
| // Build the incrementing scatter |
| EndResult = NewInduction = |
| tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate); |
| } |
| Instruction *AddInst = cast<Instruction>(Offsets); |
| AddInst->replaceAllUsesWith(NewInduction); |
| AddInst->eraseFromParent(); |
| Phi->setIncomingValue(IncrementIndex, NewInduction); |
| |
| return EndResult; |
| } |
| |
| void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi, |
| Value *OffsSecondOperand, |
| unsigned StartIndex) { |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n"); |
| Instruction *InsertionPoint = |
| &cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back()); |
| // Initialize the phi with the sum of the start value and the pushed-out |
| // operand |
| Instruction *NewIndex = BinaryOperator::Create( |
| Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand, |
| "PushedOutAdd", InsertionPoint); |
| unsigned IncrementIndex = StartIndex == 0 ? 1 : 0; |
| |
| // Order such that start index comes first (this reduces mov's) |
| Phi->addIncoming(NewIndex, Phi->getIncomingBlock(StartIndex)); |
| Phi->addIncoming(Phi->getIncomingValue(IncrementIndex), |
| Phi->getIncomingBlock(IncrementIndex)); |
| Phi->removeIncomingValue(IncrementIndex); |
| Phi->removeIncomingValue(StartIndex); |
| } |
| |
| void MVEGatherScatterLowering::pushOutMulShl(unsigned Opcode, PHINode *&Phi, |
| Value *IncrementPerRound, |
| Value *OffsSecondOperand, |
| unsigned LoopIncrement, |
| IRBuilder<> &Builder) { |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising mul instruction\n"); |
| |
| // Create a new start value and a new per-round increment outside of the |
| // loop, with the mul/shl applied to both, so that the loop body no longer |
| // needs to perform the operation |
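| // e.g. if the loop computes `mul %phi, splat(C)` on the recurrence |
| // phi = {Start, +, Inc}, the phi is rewritten as the recurrence |
| // {Start*C, +, Inc*C} and the mul disappears from the loop body |
| // (illustrative). |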
| Instruction *InsertionPoint = &cast<Instruction>( |
| Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back()); |
| |
| // Create a new index |
| Value *StartIndex = |
| BinaryOperator::Create((Instruction::BinaryOps)Opcode, |
| Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1), |
| OffsSecondOperand, "PushedOutMul", InsertionPoint); |
| |
| Instruction *Product = |
| BinaryOperator::Create((Instruction::BinaryOps)Opcode, IncrementPerRound, |
| OffsSecondOperand, "Product", InsertionPoint); |
| // Increment the phi by Product each round instead of recomputing the mul/shl |
| Instruction *NewIncrement = BinaryOperator::Create( |
| Instruction::Add, Phi, Product, "IncrementPushedOutMul", |
| cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back()) |
| .getPrevNode()); |
| |
| Phi->addIncoming(StartIndex, |
| Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)); |
| Phi->addIncoming(NewIncrement, Phi->getIncomingBlock(LoopIncrement)); |
| Phi->removeIncomingValue((unsigned)0); |
| Phi->removeIncomingValue((unsigned)0); |
| } |
| |
| // Check whether all uses of this instruction are as offsets of |
| // gathers/scatters, or simple arithmetic that is itself only used by |
| // gathers/scatters |
| static bool hasAllGatScatUsers(Instruction *I, const DataLayout &DL) { |
| if (I->use_empty()) |
| return false; |
| bool Gatscat = true; |
| for (User *U : I->users()) { |
| if (!isa<Instruction>(U)) |
| return false; |
| if (isa<GetElementPtrInst>(U) || |
| isGatherScatter(dyn_cast<IntrinsicInst>(U))) { |
| return Gatscat; |
| } else { |
| unsigned OpCode = cast<Instruction>(U)->getOpcode(); |
| if ((OpCode == Instruction::Add || OpCode == Instruction::Mul || |
| OpCode == Instruction::Shl || |
| isAddLikeOr(cast<Instruction>(U), DL)) && |
| hasAllGatScatUsers(cast<Instruction>(U), DL)) { |
| continue; |
| } |
| return false; |
| } |
| } |
| return Gatscat; |
| } |
| |
| bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, |
| LoopInfo *LI) { |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n" |
| << *Offsets << "\n"); |
| // Optimise the addresses of gathers/scatters by moving invariant |
| // calculations out of the loop |
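| // e.g. if the loop computes `%offs = add <4 x i32> %iv.phi, splat(C)`, the |
| // add can be absorbed into the phi by adding C to its start value outside |
| // the loop; muls and shls are handled analogously by pushOutMulShl |
| // (illustrative). |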
| if (!isa<Instruction>(Offsets)) |
| return false; |
| Instruction *Offs = cast<Instruction>(Offsets); |
| if (Offs->getOpcode() != Instruction::Add && !isAddLikeOr(Offs, *DL) && |
| Offs->getOpcode() != Instruction::Mul && |
| Offs->getOpcode() != Instruction::Shl) |
| return false; |
| Loop *L = LI->getLoopFor(BB); |
| if (L == nullptr) |
| return false; |
| if (!Offs->hasOneUse()) { |
| if (!hasAllGatScatUsers(Offs, *DL)) |
| return false; |
| } |
| |
| // Find out which, if any, operand of the instruction |
| // is a phi node |
| PHINode *Phi; |
| int OffsSecondOp; |
| if (isa<PHINode>(Offs->getOperand(0))) { |
| Phi = cast<PHINode>(Offs->getOperand(0)); |
| OffsSecondOp = 1; |
| } else if (isa<PHINode>(Offs->getOperand(1))) { |
| Phi = cast<PHINode>(Offs->getOperand(1)); |
| OffsSecondOp = 0; |
| } else { |
| bool Changed = false; |
| if (isa<Instruction>(Offs->getOperand(0)) && |
| L->contains(cast<Instruction>(Offs->getOperand(0)))) |
| Changed |= optimiseOffsets(Offs->getOperand(0), BB, LI); |
| if (isa<Instruction>(Offs->getOperand(1)) && |
| L->contains(cast<Instruction>(Offs->getOperand(1)))) |
| Changed |= optimiseOffsets(Offs->getOperand(1), BB, LI); |
| if (!Changed) |
| return false; |
| if (isa<PHINode>(Offs->getOperand(0))) { |
| Phi = cast<PHINode>(Offs->getOperand(0)); |
| OffsSecondOp = 1; |
| } else if (isa<PHINode>(Offs->getOperand(1))) { |
| Phi = cast<PHINode>(Offs->getOperand(1)); |
| OffsSecondOp = 0; |
| } else { |
| return false; |
| } |
| } |
| // A phi node we want to perform this transformation on needs to be in the |
| // loop header. |
| if (Phi->getParent() != L->getHeader()) |
| return false; |
| |
| // We're looking for a simple add recurrence. |
| BinaryOperator *IncInstruction; |
| Value *Start, *IncrementPerRound; |
| if (!matchSimpleRecurrence(Phi, IncInstruction, Start, IncrementPerRound) || |
| IncInstruction->getOpcode() != Instruction::Add) |
| return false; |
| |
| int IncrementingBlock = Phi->getIncomingValue(0) == IncInstruction ? 0 : 1; |
| |
| // Get the value that is added to/multiplied with the phi |
| Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp); |
| |
| if (IncrementPerRound->getType() != OffsSecondOperand->getType() || |
| !L->isLoopInvariant(OffsSecondOperand)) |
| // Something has gone wrong, abort |
| return false; |
| |
| // Only proceed if the increment per round is a constant or an instruction |
| // which does not originate from within the loop |
| if (!isa<Constant>(IncrementPerRound) && |
| !(isa<Instruction>(IncrementPerRound) && |
| !L->contains(cast<Instruction>(IncrementPerRound)))) |
| return false; |
| |
| // If the phi is not used by anything else, we can just adapt it when |
| // replacing the instruction; if it is, we'll have to duplicate it |
| PHINode *NewPhi; |
| if (Phi->getNumUses() == 2) { |
| // No other users -> reuse existing phi (One user is the instruction |
| // we're looking at, the other is the phi increment) |
| if (IncInstruction->getNumUses() != 1) { |
| // If the incrementing instruction does have more users than |
| // our phi, we need to copy it |
| IncInstruction = BinaryOperator::Create( |
| Instruction::BinaryOps(IncInstruction->getOpcode()), Phi, |
| IncrementPerRound, "LoopIncrement", IncInstruction); |
| Phi->setIncomingValue(IncrementingBlock, IncInstruction); |
| } |
| NewPhi = Phi; |
| } else { |
| // There are other users -> create a new phi |
| NewPhi = PHINode::Create(Phi->getType(), 2, "NewPhi", Phi); |
| // Copy the incoming values of the old phi |
| NewPhi->addIncoming(Phi->getIncomingValue(IncrementingBlock == 1 ? 0 : 1), |
| Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1)); |
| IncInstruction = BinaryOperator::Create( |
| Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi, |
| IncrementPerRound, "LoopIncrement", IncInstruction); |
| NewPhi->addIncoming(IncInstruction, |
| Phi->getIncomingBlock(IncrementingBlock)); |
| IncrementingBlock = 1; |
| } |
| |
| IRBuilder<> Builder(BB->getContext()); |
| Builder.SetInsertPoint(Phi); |
| Builder.SetCurrentDebugLocation(Offs->getDebugLoc()); |
| |
| switch (Offs->getOpcode()) { |
| case Instruction::Add: |
| case Instruction::Or: |
| pushOutAdd(NewPhi, OffsSecondOperand, IncrementingBlock == 1 ? 0 : 1); |
| break; |
| case Instruction::Mul: |
| case Instruction::Shl: |
| pushOutMulShl(Offs->getOpcode(), NewPhi, IncrementPerRound, |
| OffsSecondOperand, IncrementingBlock, Builder); |
| break; |
| default: |
| return false; |
| } |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: simplified loop variable " |
| << "add/mul\n"); |
| |
| // The instruction has now been "absorbed" into the phi value |
| Offs->replaceAllUsesWith(NewPhi); |
| if (Offs->hasNUses(0)) |
| Offs->eraseFromParent(); |
| // Clean up the old increment in case it's unused because we built a new |
| // one |
| if (IncInstruction->hasNUses(0)) |
| IncInstruction->eraseFromParent(); |
| |
| return true; |
| } |
| |
| static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, |
| IRBuilder<> &Builder) { |
| // Splat the non-vector value to a vector of the given type - if the value |
| // is a constant (and its value isn't too big), we can even use this |
| // opportunity to materialise it directly at the width of the vector |
| // elements |
| auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) { |
| ConstantInt *Const; |
| if ((Const = dyn_cast<ConstantInt>(NonVectorVal)) && |
| VT->getElementType() != NonVectorVal->getType()) { |
| unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits(); |
| uint64_t N = Const->getZExtValue(); |
| if (N < (1ULL << (TargetElemSize - 1))) { |
| NonVectorVal = Builder.CreateVectorSplat( |
| VT->getNumElements(), Builder.getIntN(TargetElemSize, N)); |
| return; |
| } |
| } |
| NonVectorVal = |
| Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal); |
| }; |
| |
| FixedVectorType *XElType = dyn_cast<FixedVectorType>(X->getType()); |
| FixedVectorType *YElType = dyn_cast<FixedVectorType>(Y->getType()); |
| // If one of X, Y is not a vector, we have to splat it in order |
| // to add the two of them. |
| if (XElType && !YElType) { |
| FixSummands(XElType, Y); |
| YElType = cast<FixedVectorType>(Y->getType()); |
| } else if (YElType && !XElType) { |
| FixSummands(YElType, X); |
| XElType = cast<FixedVectorType>(X->getType()); |
| } |
| assert(XElType && YElType && "Unknown vector types"); |
| // Check that the summands are of compatible types |
| if (XElType != YElType) { |
| LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n"); |
| return nullptr; |
| } |
| |
| if (XElType->getElementType()->getScalarSizeInBits() != 32) { |
| // Check that by adding the vectors we do not accidentally |
| // create an overflow |
| Constant *ConstX = dyn_cast<Constant>(X); |
| Constant *ConstY = dyn_cast<Constant>(Y); |
| if (!ConstX || !ConstY) |
| return nullptr; |
| unsigned TargetElemSize = 128 / XElType->getNumElements(); |
| for (unsigned i = 0; i < XElType->getNumElements(); i++) { |
| ConstantInt *ConstXEl = |
| dyn_cast<ConstantInt>(ConstX->getAggregateElement(i)); |
| ConstantInt *ConstYEl = |
| dyn_cast<ConstantInt>(ConstY->getAggregateElement(i)); |
| if (!ConstXEl || !ConstYEl || |
| ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >= |
| (1ULL << (TargetElemSize - 1))) |
| return nullptr; |
| } |
| } |
| |
| Value *Add = Builder.CreateAdd(X, Y); |
| |
| FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType()); |
| if (checkOffsetSize(Add, GEPType->getNumElements())) |
| return Add; |
| else |
| return nullptr; |
| } |
| |
| Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP, |
| Value *&Offsets, |
| IRBuilder<> &Builder) { |
| Value *GEPPtr = GEP->getPointerOperand(); |
| Offsets = GEP->getOperand(1); |
| // We only merge geps with constant offsets, because only for those |
| // we can make sure that we do not cause an overflow |
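| // e.g. gep(gep(%base, <constant offsets A>), <constant offsets B>) folds |
| // to gep(%base, A + B), provided the sums provably stay small enough |
| // (illustrative). |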
| if (!isa<Constant>(Offsets)) |
| return nullptr; |
| GetElementPtrInst *BaseGEP; |
| if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) { |
| // Merge the two geps into one |
| Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder); |
| if (!BaseBasePtr) |
| return nullptr; |
| Offsets = |
| CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder); |
| if (Offsets == nullptr) |
| return nullptr; |
| return BaseBasePtr; |
| } |
| return GEPPtr; |
| } |
| |
| bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB, |
| LoopInfo *LI) { |
| GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address); |
| if (!GEP) |
| return false; |
| bool Changed = false; |
| if (GEP->hasOneUse() && isa<GetElementPtrInst>(GEP->getPointerOperand())) { |
| IRBuilder<> Builder(GEP->getContext()); |
| Builder.SetInsertPoint(GEP); |
| Builder.SetCurrentDebugLocation(GEP->getDebugLoc()); |
| Value *Offsets; |
| Value *Base = foldGEP(GEP, Offsets, Builder); |
| // We only want to merge the geps if there is a real chance that they can be |
| // used by an MVE gather; thus the offset has to have the correct size |
| // (always i32 if it is not of vector type) and the base has to be a |
| // pointer. |
| if (Offsets && Base && Base != GEP) { |
| GetElementPtrInst *NewAddress = GetElementPtrInst::Create( |
| GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP); |
| GEP->replaceAllUsesWith(NewAddress); |
| GEP = NewAddress; |
| Changed = true; |
| } |
| } |
| Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI); |
| return Changed; |
| } |
| |
| bool MVEGatherScatterLowering::runOnFunction(Function &F) { |
| if (!EnableMaskedGatherScatters) |
| return false; |
| auto &TPC = getAnalysis<TargetPassConfig>(); |
| auto &TM = TPC.getTM<TargetMachine>(); |
| auto *ST = &TM.getSubtarget<ARMSubtarget>(F); |
| if (!ST->hasMVEIntegerOps()) |
| return false; |
| LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| DL = &F.getParent()->getDataLayout(); |
| SmallVector<IntrinsicInst *, 4> Gathers; |
| SmallVector<IntrinsicInst *, 4> Scatters; |
| |
| bool Changed = false; |
| |
| for (BasicBlock &BB : F) { |
| Changed |= SimplifyInstructionsInBlock(&BB); |
| |
| for (Instruction &I : BB) { |
| IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); |
| if (II && II->getIntrinsicID() == Intrinsic::masked_gather && |
| isa<FixedVectorType>(II->getType())) { |
| Gathers.push_back(II); |
| Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI); |
| } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter && |
| isa<FixedVectorType>(II->getArgOperand(0)->getType())) { |
| Scatters.push_back(II); |
| Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI); |
| } |
| } |
| } |
| for (unsigned i = 0; i < Gathers.size(); i++) { |
| IntrinsicInst *I = Gathers[i]; |
| Instruction *L = lowerGather(I); |
| if (L == nullptr) |
| continue; |
| |
| // Get rid of any now dead instructions |
| SimplifyInstructionsInBlock(L->getParent()); |
| Changed = true; |
| } |
| |
| for (unsigned i = 0; i < Scatters.size(); i++) { |
| IntrinsicInst *I = Scatters[i]; |
| Instruction *S = lowerScatter(I); |
| if (S == nullptr) |
| continue; |
| |
| // Get rid of any now dead instructions |
| SimplifyInstructionsInBlock(S->getParent()); |
| Changed = true; |
| } |
| return Changed; |
| } |