| //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// This file contains implementations for different VPlan recipes. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "LoopVectorizationPlanner.h" |
| #include "VPlan.h" |
| #include "VPlanAnalysis.h" |
| #include "VPlanHelpers.h" |
| #include "VPlanPatternMatch.h" |
| #include "VPlanUtils.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Twine.h" |
| #include "llvm/Analysis/AssumptionCache.h" |
| #include "llvm/Analysis/IVDescriptors.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| #include "llvm/Transforms/Utils/LoopUtils.h" |
| #include "llvm/Transforms/Utils/LoopVersioning.h" |
| #include <cassert> |
| |
| using namespace llvm; |
| |
| using VectorParts = SmallVector<Value *, 2>; |
| |
| #define LV_NAME "loop-vectorize" |
| #define DEBUG_TYPE LV_NAME |
| |
| bool VPRecipeBase::mayWriteToMemory() const { |
| switch (getVPDefID()) { |
| case VPExpressionSC: |
| return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); |
| case VPInstructionSC: |
| return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); |
| case VPInterleaveEVLSC: |
| case VPInterleaveSC: |
| return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0; |
| case VPWidenStoreEVLSC: |
| case VPWidenStoreSC: |
| return true; |
| case VPReplicateSC: |
| return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) |
| ->mayWriteToMemory(); |
| case VPWidenCallSC: |
| return !cast<VPWidenCallRecipe>(this) |
| ->getCalledScalarFunction() |
| ->onlyReadsMemory(); |
| case VPWidenIntrinsicSC: |
| return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory(); |
| case VPCanonicalIVPHISC: |
| case VPBranchOnMaskSC: |
| case VPFirstOrderRecurrencePHISC: |
| case VPReductionPHISC: |
| case VPScalarIVStepsSC: |
| case VPPredInstPHISC: |
| return false; |
| case VPBlendSC: |
| case VPReductionEVLSC: |
| case VPReductionSC: |
| case VPVectorPointerSC: |
| case VPWidenCanonicalIVSC: |
| case VPWidenCastSC: |
| case VPWidenGEPSC: |
| case VPWidenIntOrFpInductionSC: |
| case VPWidenLoadEVLSC: |
| case VPWidenLoadSC: |
| case VPWidenPHISC: |
| case VPWidenSC: |
| case VPWidenSelectSC: { |
| const Instruction *I = |
| dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); |
| (void)I; |
| assert((!I || !I->mayWriteToMemory()) && |
| "underlying instruction may write to memory"); |
| return false; |
| } |
| default: |
| return true; |
| } |
| } |
| |
| bool VPRecipeBase::mayReadFromMemory() const { |
| switch (getVPDefID()) { |
| case VPExpressionSC: |
| return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); |
| case VPInstructionSC: |
| return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); |
| case VPWidenLoadEVLSC: |
| case VPWidenLoadSC: |
| return true; |
| case VPReplicateSC: |
| return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) |
| ->mayReadFromMemory(); |
| case VPWidenCallSC: |
| return !cast<VPWidenCallRecipe>(this) |
| ->getCalledScalarFunction() |
| ->onlyWritesMemory(); |
| case VPWidenIntrinsicSC: |
| return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory(); |
| case VPBranchOnMaskSC: |
| case VPFirstOrderRecurrencePHISC: |
| case VPPredInstPHISC: |
| case VPScalarIVStepsSC: |
| case VPWidenStoreEVLSC: |
| case VPWidenStoreSC: |
| return false; |
| case VPBlendSC: |
| case VPReductionEVLSC: |
| case VPReductionSC: |
| case VPVectorPointerSC: |
| case VPWidenCanonicalIVSC: |
| case VPWidenCastSC: |
| case VPWidenGEPSC: |
| case VPWidenIntOrFpInductionSC: |
| case VPWidenPHISC: |
| case VPWidenSC: |
| case VPWidenSelectSC: { |
| const Instruction *I = |
| dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); |
| (void)I; |
| assert((!I || !I->mayReadFromMemory()) && |
| "underlying instruction may read from memory"); |
| return false; |
| } |
| default: |
| // FIXME: Return false if the recipe represents an interleaved store. |
| return true; |
| } |
| } |
| |
| bool VPRecipeBase::mayHaveSideEffects() const { |
| switch (getVPDefID()) { |
| case VPExpressionSC: |
| return cast<VPExpressionRecipe>(this)->mayHaveSideEffects(); |
| case VPDerivedIVSC: |
| case VPFirstOrderRecurrencePHISC: |
| case VPPredInstPHISC: |
| case VPVectorEndPointerSC: |
| return false; |
| case VPInstructionSC: |
| return mayWriteToMemory(); |
| case VPWidenCallSC: { |
| Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); |
| return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
| } |
| case VPWidenIntrinsicSC: |
| return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects(); |
| case VPBlendSC: |
| case VPReductionEVLSC: |
| case VPReductionSC: |
| case VPScalarIVStepsSC: |
| case VPVectorPointerSC: |
| case VPWidenCanonicalIVSC: |
| case VPWidenCastSC: |
| case VPWidenGEPSC: |
| case VPWidenIntOrFpInductionSC: |
| case VPWidenPHISC: |
| case VPWidenPointerInductionSC: |
| case VPWidenSC: |
| case VPWidenSelectSC: { |
| const Instruction *I = |
| dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); |
| (void)I; |
| assert((!I || !I->mayHaveSideEffects()) && |
| "underlying instruction has side-effects"); |
| return false; |
| } |
| case VPInterleaveEVLSC: |
| case VPInterleaveSC: |
| return mayWriteToMemory(); |
| case VPWidenLoadEVLSC: |
| case VPWidenLoadSC: |
| case VPWidenStoreEVLSC: |
| case VPWidenStoreSC: |
| assert( |
| cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
| mayWriteToMemory() && |
| "mayHaveSideffects result for ingredient differs from this " |
| "implementation"); |
| return mayWriteToMemory(); |
| case VPReplicateSC: { |
| auto *R = cast<VPReplicateRecipe>(this); |
| return R->getUnderlyingInstr()->mayHaveSideEffects(); |
| } |
| default: |
| return true; |
| } |
| } |
| |
| void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
| assert(!Parent && "Recipe already in some VPBasicBlock"); |
| assert(InsertPos->getParent() && |
| "Insertion position not in any VPBasicBlock"); |
| InsertPos->getParent()->insert(this, InsertPos->getIterator()); |
| } |
| |
| void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
| iplist<VPRecipeBase>::iterator I) { |
| assert(!Parent && "Recipe already in some VPBasicBlock"); |
| assert((I == BB.end() || I->getParent() == &BB) && |
| "insertion position is not in the given block"); |
| BB.insert(this, I); |
| } |
| |
| void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
| assert(!Parent && "Recipe already in some VPBasicBlock"); |
| assert(InsertPos->getParent() && |
| "Insertion position not in any VPBasicBlock"); |
| InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); |
| } |
| |
| void VPRecipeBase::removeFromParent() { |
| assert(getParent() && "Recipe not in any VPBasicBlock"); |
| getParent()->getRecipeList().remove(getIterator()); |
| Parent = nullptr; |
| } |
| |
| iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
| assert(getParent() && "Recipe not in any VPBasicBlock"); |
| return getParent()->getRecipeList().erase(getIterator()); |
| } |
| |
| void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
| removeFromParent(); |
| insertAfter(InsertPos); |
| } |
| |
| void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
| iplist<VPRecipeBase>::iterator I) { |
| removeFromParent(); |
| insertBefore(BB, I); |
| } |
| |
| InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { |
| // Get the underlying instruction for the recipe, if there is one. It is used |
| // to |
| // * decide if cost computation should be skipped for this recipe, |
| // * apply forced target instruction cost. |
| Instruction *UI = nullptr; |
| if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) |
| UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); |
| else if (auto *IG = dyn_cast<VPInterleaveBase>(this)) |
| UI = IG->getInsertPos(); |
| else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) |
| UI = &WidenMem->getIngredient(); |
| |
| InstructionCost RecipeCost; |
| if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { |
| RecipeCost = 0; |
| } else { |
| RecipeCost = computeCost(VF, Ctx); |
| if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && |
| RecipeCost.isValid()) |
| RecipeCost = InstructionCost(ForceTargetInstructionCost); |
| } |
| |
| LLVM_DEBUG({ |
| dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; |
| dump(); |
| }); |
| return RecipeCost; |
| } |
| |
| InstructionCost VPRecipeBase::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| llvm_unreachable("subclasses should implement computeCost"); |
| } |
| |
| bool VPRecipeBase::isPhi() const { |
| return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || |
| isa<VPPhi, VPIRPhi>(this); |
| } |
| |
| bool VPRecipeBase::isScalarCast() const { |
| auto *VPI = dyn_cast<VPInstruction>(this); |
| return VPI && Instruction::isCast(VPI->getOpcode()); |
| } |
| |
| InstructionCost |
| VPPartialReductionRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| std::optional<unsigned> Opcode; |
| VPValue *Op = getOperand(0); |
| VPRecipeBase *OpR = Op->getDefiningRecipe(); |
| |
| // If the partial reduction is predicated, operand 1 will be a select. |
| using namespace llvm::VPlanPatternMatch; |
| if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { |
| OpR = Op->getDefiningRecipe(); |
| } |
| |
| Type *InputTypeA = nullptr, *InputTypeB = nullptr; |
| TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, |
| ExtBType = TTI::PR_None; |
| |
| auto GetExtendKind = [](VPRecipeBase *R) { |
| if (!R) |
| return TTI::PR_None; |
| auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R); |
| if (!WidenCastR) |
| return TTI::PR_None; |
| if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) |
| return TTI::PR_ZeroExtend; |
| if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) |
| return TTI::PR_SignExtend; |
| return TTI::PR_None; |
| }; |
| |
| // Pick out the opcode and type/extend information from a widen recipe, |
| // looking through a negation (sub from zero) to the recipe it negates. |
| auto HandleWiden = [&](VPWidenRecipe *Widen) { |
| if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) { |
| Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe()); |
| } |
| Opcode = Widen->getOpcode(); |
| VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe(); |
| VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe(); |
| InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) |
| : Widen->getOperand(0)); |
| InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) |
| : Widen->getOperand(1)); |
| ExtAType = GetExtendKind(ExtAR); |
| ExtBType = GetExtendKind(ExtBR); |
| }; |
| |
| if (isa<VPWidenCastRecipe>(OpR)) { |
| InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0)); |
| ExtAType = GetExtendKind(OpR); |
| } else if (isa<VPReductionPHIRecipe>(OpR)) { |
| auto RedPhiOp1R = getOperand(1)->getDefiningRecipe(); |
| if (isa<VPWidenCastRecipe>(RedPhiOp1R)) { |
| InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0)); |
| ExtAType = GetExtendKind(RedPhiOp1R); |
| } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R)) |
| HandleWiden(Widen); |
| } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) { |
| HandleWiden(Widen); |
| } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) { |
| return Reduction->computeCost(VF, Ctx); |
| } |
| auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); |
| return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, |
| PhiType, VF, ExtAType, ExtBType, |
| Opcode, Ctx.CostKind); |
| } |
| |
| void VPPartialReductionRecipe::execute(VPTransformState &State) { |
| auto &Builder = State.Builder; |
| |
| assert(getOpcode() == Instruction::Add && |
| "Unhandled partial reduction opcode"); |
| |
| Value *BinOpVal = State.get(getOperand(1)); |
| Value *PhiVal = State.get(getOperand(0)); |
| assert(PhiVal && BinOpVal && "Phi and BinOp must be set"); |
| |
| Type *RetTy = PhiVal->getType(); |
| |
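| // Emit llvm.vector.partial.reduce.add(Phi, BinOp): the lanes of the wider |
| // BinOp vector are accumulated into the narrower accumulator type RetTy. |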
| CallInst *V = |
| Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, |
| {PhiVal, BinOpVal}, nullptr, "partial.reduce"); |
| |
| State.set(this, V); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "PARTIAL-REDUCE "; |
| printAsOperand(O, SlotTracker); |
| O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPIRFlags::intersectFlags(const VPIRFlags &Other) { |
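| // Conservatively intersect the flags: each flag stays set only if it is |
| // set in both recipes; flags that cannot be dropped must match. |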
| assert(OpType == Other.OpType && "OpType must match"); |
| switch (OpType) { |
| case OperationType::OverflowingBinOp: |
| WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; |
| WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; |
| break; |
| case OperationType::Trunc: |
| TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; |
| TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; |
| break; |
| case OperationType::DisjointOp: |
| DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; |
| break; |
| case OperationType::PossiblyExactOp: |
| ExactFlags.IsExact &= Other.ExactFlags.IsExact; |
| break; |
| case OperationType::GEPOp: |
| GEPFlags &= Other.GEPFlags; |
| break; |
| case OperationType::FPMathOp: |
| FMFs.NoNaNs &= Other.FMFs.NoNaNs; |
| FMFs.NoInfs &= Other.FMFs.NoInfs; |
| break; |
| case OperationType::NonNegOp: |
| NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; |
| break; |
| case OperationType::Cmp: |
| assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate"); |
| break; |
| case OperationType::Other: |
| assert(AllFlags == Other.AllFlags && "Cannot drop other flags"); |
| break; |
| } |
| } |
| |
| FastMathFlags VPIRFlags::getFastMathFlags() const { |
| assert(OpType == OperationType::FPMathOp && |
| "recipe doesn't have fast math flags"); |
| FastMathFlags Res; |
| Res.setAllowReassoc(FMFs.AllowReassoc); |
| Res.setNoNaNs(FMFs.NoNaNs); |
| Res.setNoInfs(FMFs.NoInfs); |
| Res.setNoSignedZeros(FMFs.NoSignedZeros); |
| Res.setAllowReciprocal(FMFs.AllowReciprocal); |
| Res.setAllowContract(FMFs.AllowContract); |
| Res.setApproxFunc(FMFs.ApproxFunc); |
| return Res; |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPSingleDefRecipe::dump() const { VPDef::dump(); } |
| #endif |
| |
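| /// Return the operand at index PartOpIdx if it is the unroll-part operand |
| /// appended by unrolling (in which case it is \p U's last operand), or |
| /// nullptr if \p U does not carry one. |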
| template <unsigned PartOpIdx> |
| VPValue * |
| VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const { |
| if (U.getNumOperands() == PartOpIdx + 1) |
| return U.getOperand(PartOpIdx); |
| return nullptr; |
| } |
| |
| template <unsigned PartOpIdx> |
| unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const { |
| if (auto *UnrollPartOp = getUnrollPartOperand(U)) |
| return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); |
| return 0; |
| } |
| |
| namespace llvm { |
| template class VPUnrollPartAccessor<1>; |
| template class VPUnrollPartAccessor<2>; |
| template class VPUnrollPartAccessor<3>; |
| } // namespace llvm |
| |
| VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, |
| const VPIRFlags &Flags, DebugLoc DL, |
| const Twine &Name) |
| : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), |
| VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { |
| assert(flagsValidForOpcode(getOpcode()) && |
| "Set flags not supported for the provided opcode"); |
| assert((getNumOperandsForOpcode(Opcode) == -1u || |
| getNumOperandsForOpcode(Opcode) == getNumOperands()) && |
| "number of operands does not match opcode"); |
| } |
| |
| #ifndef NDEBUG |
| unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { |
| if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) |
| return 1; |
| |
| if (Instruction::isBinaryOp(Opcode)) |
| return 2; |
| |
| switch (Opcode) { |
| case VPInstruction::StepVector: |
| case VPInstruction::VScale: |
| return 0; |
| case Instruction::Alloca: |
| case Instruction::ExtractValue: |
| case Instruction::Freeze: |
| case Instruction::Load: |
| case VPInstruction::AnyOf: |
| case VPInstruction::BranchOnCond: |
| case VPInstruction::BuildStructVector: |
| case VPInstruction::BuildVector: |
| case VPInstruction::CalculateTripCountMinusVF: |
| case VPInstruction::CanonicalIVIncrementForPart: |
| case VPInstruction::ExplicitVectorLength: |
| case VPInstruction::ExtractLastElement: |
| case VPInstruction::ExtractPenultimateElement: |
| case VPInstruction::FirstActiveLane: |
| case VPInstruction::Not: |
| return 1; |
| case Instruction::ICmp: |
| case Instruction::FCmp: |
| case Instruction::Store: |
| case VPInstruction::BranchOnCount: |
| case VPInstruction::ComputeReductionResult: |
| case VPInstruction::FirstOrderRecurrenceSplice: |
| case VPInstruction::LogicalAnd: |
| case VPInstruction::PtrAdd: |
| case VPInstruction::WidePtrAdd: |
| case VPInstruction::WideIVStep: |
| return 2; |
| case Instruction::Select: |
| case VPInstruction::ActiveLaneMask: |
| case VPInstruction::ComputeAnyOfResult: |
| case VPInstruction::ReductionStartVector: |
| return 3; |
| case VPInstruction::ComputeFindIVResult: |
| return 4; |
| case Instruction::Call: |
| case Instruction::GetElementPtr: |
| case Instruction::PHI: |
| case Instruction::Switch: |
| // Cannot determine the number of operands from the opcode. |
| return -1u; |
| } |
| llvm_unreachable("all cases should be handled above"); |
| } |
| #endif |
| |
| bool VPInstruction::doesGeneratePerAllLanes() const { |
| return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); |
| } |
| |
| bool VPInstruction::canGenerateScalarForFirstLane() const { |
| if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) |
| return true; |
| if (isSingleScalar() || isVectorToScalar()) |
| return true; |
| switch (Opcode) { |
| case Instruction::Freeze: |
| case Instruction::ICmp: |
| case Instruction::PHI: |
| case Instruction::Select: |
| case VPInstruction::BranchOnCond: |
| case VPInstruction::BranchOnCount: |
| case VPInstruction::CalculateTripCountMinusVF: |
| case VPInstruction::CanonicalIVIncrementForPart: |
| case VPInstruction::PtrAdd: |
| case VPInstruction::ExplicitVectorLength: |
| case VPInstruction::AnyOf: |
| case VPInstruction::Not: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| /// Create a conditional branch using \p Cond branching to the successors of \p |
| /// VPBB. Note that the first successor is always forward (i.e. not created yet) |
| /// while the second successor may already have been created (if it is a header |
| /// block and VPBB is a latch). |
| static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, |
| VPTransformState &State) { |
| // Replace the temporary unreachable terminator with a new conditional |
| // branch, hooking it up to backward destination (header) for latch blocks |
| // now, and to forward destination(s) later when they are created. |
| // The second successor may be backward, iff it is already in VPBB2IRBB. |
| VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]); |
| BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc); |
| BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; |
| BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc); |
| // The first successor is always forward; reset it to nullptr for now. |
| CondBr->setSuccessor(0, nullptr); |
| IRBB->getTerminator()->eraseFromParent(); |
| return CondBr; |
| } |
| |
| Value *VPInstruction::generate(VPTransformState &State) { |
| IRBuilderBase &Builder = State.Builder; |
| |
| if (Instruction::isBinaryOp(getOpcode())) { |
| bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); |
| Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); |
| Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); |
| auto *Res = |
| Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); |
| if (auto *I = dyn_cast<Instruction>(Res)) |
| applyFlags(*I); |
| return Res; |
| } |
| |
| switch (getOpcode()) { |
| case VPInstruction::Not: { |
| bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); |
| Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); |
| return Builder.CreateNot(A, Name); |
| } |
| case Instruction::ExtractElement: { |
| assert(State.VF.isVector() && "Only extract elements from vectors"); |
| if (getOperand(1)->isLiveIn()) { |
| unsigned IdxToExtract = |
| cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue(); |
| return State.get(getOperand(0), VPLane(IdxToExtract)); |
| } |
| Value *Vec = State.get(getOperand(0)); |
| Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); |
| return Builder.CreateExtractElement(Vec, Idx, Name); |
| } |
| case Instruction::Freeze: { |
| Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); |
| return Builder.CreateFreeze(Op, Name); |
| } |
| case Instruction::FCmp: |
| case Instruction::ICmp: { |
| bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); |
| Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); |
| Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); |
| return Builder.CreateCmp(getPredicate(), A, B, Name); |
| } |
| case Instruction::PHI: { |
| llvm_unreachable("should be handled by VPPhi::execute"); |
| } |
| case Instruction::Select: { |
| bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); |
| Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); |
| Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); |
| Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); |
| return Builder.CreateSelect(Cond, Op1, Op2, Name); |
| } |
| case VPInstruction::ActiveLaneMask: { |
| // Get first lane of vector induction variable. |
| Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); |
| // Get the original loop tripcount. |
| Value *ScalarTC = State.get(getOperand(1), VPLane(0)); |
| |
| // If this part of the active lane mask is scalar, generate the CMP |
| // directly to avoid unnecessary extracts. |
| if (State.VF.isScalar()) |
| return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, |
| Name); |
| |
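| // Otherwise emit llvm.get.active.lane.mask, producing an i1 predicate |
| // vector with VF * Multiplier lanes, where Multiplier is operand 2. |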
| auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); |
| auto PredTy = VectorType::get( |
| Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue()) |
| ->getZExtValue()); |
| return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, |
| {PredTy, ScalarTC->getType()}, |
| {VIVElem0, ScalarTC}, nullptr, Name); |
| } |
| case VPInstruction::FirstOrderRecurrenceSplice: { |
| // Generate code to combine the previous and current values in vector v3. |
| // |
| // vector.ph: |
| // v_init = vector(..., ..., ..., a[-1]) |
| // br vector.body |
| // |
| // vector.body |
| // i = phi [0, vector.ph], [i+4, vector.body] |
| // v1 = phi [v_init, vector.ph], [v2, vector.body] |
| // v2 = a[i, i+1, i+2, i+3]; |
| // v3 = vector(v1(3), v2(0, 1, 2)) |
| |
| auto *V1 = State.get(getOperand(0)); |
| if (!V1->getType()->isVectorTy()) |
| return V1; |
| Value *V2 = State.get(getOperand(1)); |
| return Builder.CreateVectorSplice(V1, V2, -1, Name); |
| } |
| case VPInstruction::CalculateTripCountMinusVF: { |
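| // Compute TC > Step ? TC - Step : 0, where Step = VF * UF. The compare |
| // and select guard against unsigned underflow when TC < Step. |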
| unsigned UF = getParent()->getPlan()->getUF(); |
| Value *ScalarTC = State.get(getOperand(0), VPLane(0)); |
| Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); |
| Value *Sub = Builder.CreateSub(ScalarTC, Step); |
| Value *Cmp = |
| Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); |
| Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); |
| return Builder.CreateSelect(Cmp, Sub, Zero); |
| } |
| case VPInstruction::ExplicitVectorLength: { |
| // TODO: Restructure this code with an explicit remainder loop, vsetvli can |
| // be outside of the main loop. |
| Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); |
| // Compute EVL |
| assert(AVL->getType()->isIntegerTy() && |
| "Requested vector length should be an integer."); |
| |
| assert(State.VF.isScalable() && "Expected scalable vector factor."); |
| Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); |
| |
| Value *EVL = State.Builder.CreateIntrinsic( |
| State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, |
| {AVL, VFArg, State.Builder.getTrue()}); |
| return EVL; |
| } |
| case VPInstruction::CanonicalIVIncrementForPart: { |
| unsigned Part = getUnrollPart(*this); |
| auto *IV = State.get(getOperand(0), VPLane(0)); |
| assert(Part != 0 && "Must have a positive part"); |
| // The canonical IV is incremented by the vectorization factor (num of |
| // SIMD elements) times the unroll part. |
| Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); |
| return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), |
| hasNoSignedWrap()); |
| } |
| case VPInstruction::BranchOnCond: { |
| Value *Cond = State.get(getOperand(0), VPLane(0)); |
| auto *Br = createCondBranch(Cond, getParent(), State); |
| applyMetadata(*Br); |
| return Br; |
| } |
| case VPInstruction::BranchOnCount: { |
| // First create the compare. |
| Value *IV = State.get(getOperand(0), /*IsScalar*/ true); |
| Value *TC = State.get(getOperand(1), /*IsScalar*/ true); |
| Value *Cond = Builder.CreateICmpEQ(IV, TC); |
| return createCondBranch(Cond, getParent(), State); |
| } |
| case VPInstruction::Broadcast: { |
| return Builder.CreateVectorSplat( |
| State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); |
| } |
| case VPInstruction::BuildStructVector: { |
| // For struct types, we need to build a new 'wide' struct type, where each |
| // element is widened, i.e., we create a struct of vectors. |
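| // For example, with VF = 4 a scalar { float, i32 } becomes a single |
| // { <4 x float>, <4 x i32> } result, filled lane by lane below. |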
| auto *StructTy = |
| cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0))); |
| Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); |
| for (const auto &[LaneIndex, Op] : enumerate(operands())) { |
| for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); |
| FieldIndex++) { |
| Value *ScalarValue = |
| Builder.CreateExtractValue(State.get(Op, true), FieldIndex); |
| Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex); |
| VectorValue = |
| Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex); |
| Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex); |
| } |
| } |
| return Res; |
| } |
| case VPInstruction::BuildVector: { |
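| // Pack the single-scalar operands into one vector, one lane per operand, |
| // e.g. buildvector(a, b, c, d) produces <4 x ty> <a, b, c, d>. |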
| auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); |
| auto NumOfElements = ElementCount::getFixed(getNumOperands()); |
| Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); |
| for (const auto &[Idx, Op] : enumerate(operands())) |
| Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), |
| State.Builder.getInt32(Idx)); |
| return Res; |
| } |
| case VPInstruction::ReductionStartVector: { |
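| // Create the start vector for a reduction: a splat of the neutral value |
| // (operand 1) with the start value (operand 0) inserted into lane 0. |
| // Operand 2 may scale down the number of lanes for partial reductions. |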
| if (State.VF.isScalar()) |
| return State.get(getOperand(0), true); |
| IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| Builder.setFastMathFlags(getFastMathFlags()); |
| // If this start vector is scaled then it should produce a vector with fewer |
| // elements than the VF. |
| ElementCount VF = State.VF.divideCoefficientBy( |
| cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue()); |
| auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true)); |
| Constant *Zero = Builder.getInt32(0); |
| return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true), |
| Zero); |
| } |
| case VPInstruction::ComputeAnyOfResult: { |
| // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| // and will be removed by breaking up the recipe further. |
| auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); |
| auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); |
| Value *ReducedPartRdx = State.get(getOperand(2)); |
| for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) |
| ReducedPartRdx = Builder.CreateBinOp( |
| (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode( |
| RecurKind::AnyOf), |
| State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx"); |
| return createAnyOfReduction(Builder, ReducedPartRdx, |
| State.get(getOperand(1), VPLane(0)), OrigPhi); |
| } |
| case VPInstruction::ComputeFindIVResult: { |
| // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| // and will be removed by breaking up the recipe further. |
| auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); |
| // Get its reduction variable descriptor. |
| RecurKind RK = PhiR->getRecurrenceKind(); |
| assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| "Unexpected reduction kind"); |
| assert(!PhiR->isInLoop() && |
| "In-loop FindLastIV reduction is not supported yet"); |
| |
| // The recipe's operands are the reduction phi, the start value, the |
| // sentinel value, followed by one operand for each part of the reduction. |
| unsigned UF = getNumOperands() - 3; |
| Value *ReducedPartRdx = State.get(getOperand(3)); |
| RecurKind MinMaxKind; |
| bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK); |
| if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) |
| MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; |
| else |
| MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin; |
| for (unsigned Part = 1; Part < UF; ++Part) |
| ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, |
| State.get(getOperand(3 + Part))); |
| |
| Value *Start = State.get(getOperand(1), true); |
| Value *Sentinel = getOperand(2)->getLiveInIRValue(); |
| return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start, |
| Sentinel); |
| } |
| case VPInstruction::ComputeReductionResult: { |
| // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| // and will be removed by breaking up the recipe further. |
| auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); |
| // Get its reduction variable descriptor. |
| |
| RecurKind RK = PhiR->getRecurrenceKind(); |
| assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| "should be handled by ComputeFindIVResult"); |
| |
| // The recipe's operands are the reduction phi, followed by one operand for |
| // each part of the reduction. |
| unsigned UF = getNumOperands() - 1; |
| VectorParts RdxParts(UF); |
| for (unsigned Part = 0; Part < UF; ++Part) |
| RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); |
| |
| IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| if (hasFastMathFlags()) |
| Builder.setFastMathFlags(getFastMathFlags()); |
| |
| // Reduce all of the unrolled parts into a single vector. |
| Value *ReducedPartRdx = RdxParts[0]; |
| if (PhiR->isOrdered()) { |
| ReducedPartRdx = RdxParts[UF - 1]; |
| } else { |
| // Floating-point operations should have some FMF to enable the reduction. |
| for (unsigned Part = 1; Part < UF; ++Part) { |
| Value *RdxPart = RdxParts[Part]; |
| if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) |
| ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); |
| else { |
| Instruction::BinaryOps Opcode; |
| // For sub-recurrences, each unrolled part's reduction variable is |
| // already negated, so combine the parts with an add: |
| // reduce.add(-acc_uf0 + -acc_uf1). |
| if (RK == RecurKind::Sub) |
| Opcode = Instruction::Add; |
| else |
| Opcode = |
| (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK); |
| ReducedPartRdx = |
| Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx"); |
| } |
| } |
| } |
| |
| // Create the reduction after the loop. Note that in-loop reductions |
| // create the target reduction inside the loop, using a Reduction recipe. |
| if (State.VF.isVector() && !PhiR->isInLoop()) { |
| // TODO: Support in-order reductions based on the recurrence descriptor. |
| // All ops in the reduction inherit fast-math-flags from the recurrence |
| // descriptor. |
| ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK); |
| } |
| |
| return ReducedPartRdx; |
| } |
| case VPInstruction::ExtractLastElement: |
| case VPInstruction::ExtractPenultimateElement: { |
| unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; |
| Value *Res; |
| if (State.VF.isVector()) { |
| assert(Offset <= State.VF.getKnownMinValue() && |
| "invalid offset to extract from"); |
| // Extract lane VF - Offset from the operand. |
| Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); |
| } else { |
| assert(Offset <= 1 && "invalid offset to extract from"); |
| Res = State.get(getOperand(0)); |
| } |
| if (isa<ExtractElementInst>(Res)) |
| Res->setName(Name); |
| return Res; |
| } |
| case VPInstruction::LogicalAnd: { |
| Value *A = State.get(getOperand(0)); |
| Value *B = State.get(getOperand(1)); |
| return Builder.CreateLogicalAnd(A, B, Name); |
| } |
| case VPInstruction::PtrAdd: { |
| assert(vputils::onlyFirstLaneUsed(this) && |
| "can only generate first lane for PtrAdd"); |
| Value *Ptr = State.get(getOperand(0), VPLane(0)); |
| Value *Addend = State.get(getOperand(1), VPLane(0)); |
| return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); |
| } |
| case VPInstruction::WidePtrAdd: { |
| Value *Ptr = |
| State.get(getOperand(0), vputils::isSingleScalar(getOperand(0))); |
| Value *Addend = State.get(getOperand(1)); |
| return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); |
| } |
| case VPInstruction::AnyOf: { |
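| // Freeze each mask operand so poison lanes do not propagate, OR the masks |
| // together, and OR-reduce the result to a single i1 for vector VFs. |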
| Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); |
| for (VPValue *Op : drop_begin(operands())) |
| Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); |
| return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); |
| } |
| case VPInstruction::ExtractLane: { |
| Value *LaneToExtract = State.get(getOperand(0), true); |
| Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0)); |
| Value *Res = nullptr; |
| Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF); |
| |
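| // Operands 1..N hold the per-part vectors and LaneToExtract indexes their |
| // concatenated lanes: extract lane (LaneToExtract - part start) from each |
| // part and select the extract from the part containing LaneToExtract. |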
| for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { |
| Value *VectorStart = |
| Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1)); |
| Value *VectorIdx = Idx == 1 |
| ? LaneToExtract |
| : Builder.CreateSub(LaneToExtract, VectorStart); |
| Value *Ext = State.VF.isScalar() |
| ? State.get(getOperand(Idx)) |
| : Builder.CreateExtractElement( |
| State.get(getOperand(Idx)), VectorIdx); |
| if (Res) { |
| Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart); |
| Res = Builder.CreateSelect(Cmp, Ext, Res); |
| } else { |
| Res = Ext; |
| } |
| } |
| return Res; |
| } |
| case VPInstruction::FirstActiveLane: { |
| if (getNumOperands() == 1) { |
| Value *Mask = State.get(getOperand(0)); |
| return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, |
| true, Name); |
| } |
| // If there are multiple operands, create a chain of selects to pick the |
| // first operand with an active lane and add the number of lanes of the |
| // preceding operands. |
| Value *RuntimeVF = |
| getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF); |
| unsigned LastOpIdx = getNumOperands() - 1; |
| Value *Res = nullptr; |
| for (int Idx = LastOpIdx; Idx >= 0; --Idx) { |
| Value *TrailingZeros = |
| State.VF.isScalar() |
| ? Builder.CreateZExt( |
| Builder.CreateICmpEQ(State.get(getOperand(Idx)), |
| Builder.getFalse()), |
| Builder.getInt64Ty()) |
| : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), |
| State.get(getOperand(Idx)), |
| true, Name); |
| Value *Current = Builder.CreateAdd( |
| Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); |
| if (Res) { |
| Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF); |
| Res = Builder.CreateSelect(Cmp, Current, Res); |
| } else { |
| Res = Current; |
| } |
| } |
| |
| return Res; |
| } |
| case VPInstruction::ResumeForEpilogue: |
| return State.get(getOperand(0), true); |
| default: |
| llvm_unreachable("Unsupported opcode for instruction"); |
| } |
| } |
| |
| InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( |
| unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const { |
| Type *ScalarTy = Ctx.Types.inferScalarType(this); |
| Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy; |
| switch (Opcode) { |
| case Instruction::FNeg: |
| return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind); |
| case Instruction::UDiv: |
| case Instruction::SDiv: |
| case Instruction::SRem: |
| case Instruction::URem: |
| case Instruction::Add: |
| case Instruction::FAdd: |
| case Instruction::Sub: |
| case Instruction::FSub: |
| case Instruction::Mul: |
| case Instruction::FMul: |
| case Instruction::FDiv: |
| case Instruction::FRem: |
| case Instruction::Shl: |
| case Instruction::LShr: |
| case Instruction::AShr: |
| case Instruction::And: |
| case Instruction::Or: |
| case Instruction::Xor: { |
| TargetTransformInfo::OperandValueInfo RHSInfo = { |
| TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; |
| |
| if (VF.isVector()) { |
| // Certain instructions can be cheaper to vectorize if they have a |
| // constant second vector operand. One example of this are shifts on x86. |
| VPValue *RHS = getOperand(1); |
| RHSInfo = Ctx.getOperandInfo(RHS); |
| |
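| // An operand defined outside the loop regions is loop-invariant and thus |
| // uniform across lanes; mark it so if TTI could not classify it. |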
| if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && |
| getOperand(1)->isDefinedOutsideLoopRegions()) |
| RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; |
| } |
| |
| Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); |
| SmallVector<const Value *, 4> Operands; |
| if (CtxI) |
| Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); |
| return Ctx.TTI.getArithmeticInstrCost( |
| Opcode, ResultTy, Ctx.CostKind, |
| {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, |
| RHSInfo, Operands, CtxI, &Ctx.TLI); |
| } |
| case Instruction::Freeze: |
| // TTI has no dedicated entry for freeze; approximate its cost with that |
| // of a 'mul'. |
| return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy, |
| Ctx.CostKind); |
| case Instruction::ExtractValue: |
| return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, |
| Ctx.CostKind); |
| case Instruction::ICmp: |
| case Instruction::FCmp: { |
| Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0)); |
| Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy; |
| Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); |
| return Ctx.TTI.getCmpSelInstrCost( |
| Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(), |
| Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None}, |
| {TTI::OK_AnyValue, TTI::OP_None}, CtxI); |
| } |
| } |
| llvm_unreachable("called for unsupported opcode"); |
| } |
| |
| InstructionCost VPInstruction::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| if (Instruction::isBinaryOp(getOpcode())) { |
| if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) { |
| // TODO: Compute cost for VPInstructions without underlying values once |
| // the legacy cost model has been retired. |
| return 0; |
| } |
| |
| assert(!doesGeneratePerAllLanes() && |
| "Should only generate a vector value or single scalar, not scalars " |
| "for all lanes."); |
| return getCostForRecipeWithOpcode( |
| getOpcode(), |
| vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx); |
| } |
| |
| switch (getOpcode()) { |
| case Instruction::Select: { |
| // TODO: It may be possible to improve this by analyzing where the |
| // condition operand comes from. |
| CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| auto *CondTy = Ctx.Types.inferScalarType(getOperand(0)); |
| auto *VecTy = Ctx.Types.inferScalarType(getOperand(1)); |
| if (!vputils::onlyFirstLaneUsed(this)) { |
| CondTy = toVectorTy(CondTy, VF); |
| VecTy = toVectorTy(VecTy, VF); |
| } |
| return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred, |
| Ctx.CostKind); |
| } |
| case Instruction::ExtractElement: |
| case VPInstruction::ExtractLane: { |
| if (VF.isScalar()) { |
| // ExtractLane with VF=1 itself handles extracting across multiple |
| // parts; there is no vector element to extract, so no extra cost. |
| return 0; |
| } |
| |
| // Add on the cost of extracting the element. |
| auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); |
| return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, |
| Ctx.CostKind); |
| } |
| case VPInstruction::AnyOf: { |
| auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); |
| return Ctx.TTI.getArithmeticReductionCost( |
| Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind); |
| } |
| case VPInstruction::FirstActiveLane: { |
| Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); |
| if (VF.isScalar()) |
| return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, |
| CmpInst::makeCmpResultType(ScalarTy), |
| CmpInst::ICMP_EQ, Ctx.CostKind); |
| // Calculate the cost of determining the lane index. |
| auto *PredTy = toVectorTy(ScalarTy, VF); |
| IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
| Type::getInt64Ty(Ctx.LLVMCtx), |
| {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); |
| return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); |
| } |
| case VPInstruction::FirstOrderRecurrenceSplice: { |
| assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); |
| SmallVector<int> Mask(VF.getKnownMinValue()); |
| std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); |
| Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); |
| |
| return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, |
| cast<VectorType>(VectorTy), |
| cast<VectorType>(VectorTy), Mask, |
| Ctx.CostKind, VF.getKnownMinValue() - 1); |
| } |
| case VPInstruction::ActiveLaneMask: { |
| Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); |
| unsigned Multiplier = |
| cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue(); |
| Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier); |
| IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, |
| {ArgTy, ArgTy}); |
| return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); |
| } |
| case VPInstruction::ExplicitVectorLength: { |
| Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0)); |
| Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx); |
| Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx); |
| IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, |
| I32Ty, {Arg0Ty, I32Ty, I1Ty}); |
| return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); |
| } |
| case VPInstruction::ExtractLastElement: { |
| // Add on the cost of extracting the element. |
| auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); |
| return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement, |
| VecTy, Ctx.CostKind, 0); |
| } |
| case VPInstruction::ExtractPenultimateElement: |
| if (VF == ElementCount::getScalable(1)) |
| return InstructionCost::getInvalid(); |
| [[fallthrough]]; |
| default: |
| // TODO: Compute cost for other VPInstructions once the legacy cost model |
| // has been retired. |
| assert(!getUnderlyingValue() && |
| "unexpected VPInstruction witht underlying value"); |
| return 0; |
| } |
| } |
| |
| bool VPInstruction::isVectorToScalar() const { |
| return getOpcode() == VPInstruction::ExtractLastElement || |
| getOpcode() == VPInstruction::ExtractPenultimateElement || |
| getOpcode() == Instruction::ExtractElement || |
| getOpcode() == VPInstruction::ExtractLane || |
| getOpcode() == VPInstruction::FirstActiveLane || |
| getOpcode() == VPInstruction::ComputeAnyOfResult || |
| getOpcode() == VPInstruction::ComputeFindIVResult || |
| getOpcode() == VPInstruction::ComputeReductionResult || |
| getOpcode() == VPInstruction::AnyOf; |
| } |
| |
| bool VPInstruction::isSingleScalar() const { |
| switch (getOpcode()) { |
| case Instruction::PHI: |
| case VPInstruction::ExplicitVectorLength: |
| case VPInstruction::ResumeForEpilogue: |
| case VPInstruction::VScale: |
| return true; |
| default: |
| return isScalarCast(); |
| } |
| } |
| |
| void VPInstruction::execute(VPTransformState &State) { |
| assert(!State.Lane && "VPInstruction executing a Lane"); |
| IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| assert(flagsValidForOpcode(getOpcode()) && |
| "Set flags not supported for the provided opcode"); |
| if (hasFastMathFlags()) |
| State.Builder.setFastMathFlags(getFastMathFlags()); |
| Value *GeneratedValue = generate(State); |
| if (!hasResult()) |
| return; |
| assert(GeneratedValue && "generate must produce a value"); |
| bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
| (vputils::onlyFirstLaneUsed(this) || |
| isVectorToScalar() || isSingleScalar()); |
| assert((((GeneratedValue->getType()->isVectorTy() || |
| GeneratedValue->getType()->isStructTy()) == |
| !GeneratesPerFirstLaneOnly) || |
| State.VF.isScalar()) && |
| "scalar value but not only first lane defined"); |
| State.set(this, GeneratedValue, |
| /*IsScalar*/ GeneratesPerFirstLaneOnly); |
| } |
| |
| bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { |
| if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) |
| return false; |
| switch (getOpcode()) { |
| case Instruction::ExtractElement: |
| case Instruction::Freeze: |
| case Instruction::FCmp: |
| case Instruction::ICmp: |
| case Instruction::Select: |
| case Instruction::PHI: |
| case VPInstruction::AnyOf: |
| case VPInstruction::BuildStructVector: |
| case VPInstruction::BuildVector: |
| case VPInstruction::CalculateTripCountMinusVF: |
| case VPInstruction::CanonicalIVIncrementForPart: |
| case VPInstruction::ExtractLane: |
| case VPInstruction::ExtractLastElement: |
| case VPInstruction::ExtractPenultimateElement: |
| case VPInstruction::FirstActiveLane: |
| case VPInstruction::FirstOrderRecurrenceSplice: |
| case VPInstruction::LogicalAnd: |
| case VPInstruction::Not: |
| case VPInstruction::PtrAdd: |
| case VPInstruction::WideIVStep: |
| case VPInstruction::WidePtrAdd: |
| case VPInstruction::StepVector: |
| case VPInstruction::ReductionStartVector: |
| case VPInstruction::VScale: |
| return false; |
| default: |
| return true; |
| } |
| } |
| |
| bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { |
| assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); |
| if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) |
| return vputils::onlyFirstLaneUsed(this); |
| |
| switch (getOpcode()) { |
| default: |
| return false; |
| case Instruction::ExtractElement: |
| return Op == getOperand(1); |
| case Instruction::PHI: |
| return true; |
| case Instruction::FCmp: |
| case Instruction::ICmp: |
| case Instruction::Select: |
| case Instruction::Or: |
| case Instruction::Freeze: |
| case VPInstruction::Not: |
| // TODO: Cover additional opcodes. |
| return vputils::onlyFirstLaneUsed(this); |
| case VPInstruction::ActiveLaneMask: |
| case VPInstruction::ExplicitVectorLength: |
| case VPInstruction::CalculateTripCountMinusVF: |
| case VPInstruction::CanonicalIVIncrementForPart: |
| case VPInstruction::BranchOnCount: |
| case VPInstruction::BranchOnCond: |
| case VPInstruction::Broadcast: |
| case VPInstruction::ReductionStartVector: |
| return true; |
| case VPInstruction::BuildStructVector: |
| case VPInstruction::BuildVector: |
| // Before its operands are replicated by VF, Build(Struct)Vector has a |
| // single operand and uses all of its lanes; after replicating, it has |
| // one operand per lane and only the first lane of each is used. |
| return getNumOperands() > 1; |
| case VPInstruction::PtrAdd: |
| return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); |
| case VPInstruction::WidePtrAdd: |
| return Op == getOperand(0); |
| case VPInstruction::ComputeAnyOfResult: |
| case VPInstruction::ComputeFindIVResult: |
| return Op == getOperand(1); |
| case VPInstruction::ExtractLane: |
| return Op == getOperand(0); |
| } |
| llvm_unreachable("switch should return"); |
| } |
| |
| bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { |
| assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); |
| if (Instruction::isBinaryOp(getOpcode())) |
| return vputils::onlyFirstPartUsed(this); |
| |
| switch (getOpcode()) { |
| default: |
| return false; |
| case Instruction::FCmp: |
| case Instruction::ICmp: |
| case Instruction::Select: |
| return vputils::onlyFirstPartUsed(this); |
| case VPInstruction::BranchOnCount: |
| case VPInstruction::BranchOnCond: |
| case VPInstruction::CanonicalIVIncrementForPart: |
| return true; |
| } |
| llvm_unreachable("switch should return"); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPInstruction::dump() const { |
| VPSlotTracker SlotTracker(getParent()->getPlan()); |
| print(dbgs(), "", SlotTracker); |
| } |
| |
| void VPInstruction::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; |
| |
| if (hasResult()) { |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| } |
| |
| switch (getOpcode()) { |
| case VPInstruction::Not: |
| O << "not"; |
| break; |
| case VPInstruction::SLPLoad: |
| O << "combined load"; |
| break; |
| case VPInstruction::SLPStore: |
| O << "combined store"; |
| break; |
| case VPInstruction::ActiveLaneMask: |
| O << "active lane mask"; |
| break; |
| case VPInstruction::ExplicitVectorLength: |
| O << "EXPLICIT-VECTOR-LENGTH"; |
| break; |
| case VPInstruction::FirstOrderRecurrenceSplice: |
| O << "first-order splice"; |
| break; |
| case VPInstruction::BranchOnCond: |
| O << "branch-on-cond"; |
| break; |
| case VPInstruction::CalculateTripCountMinusVF: |
| O << "TC > VF ? TC - VF : 0"; |
| break; |
| case VPInstruction::CanonicalIVIncrementForPart: |
| O << "VF * Part +"; |
| break; |
| case VPInstruction::BranchOnCount: |
| O << "branch-on-count"; |
| break; |
| case VPInstruction::Broadcast: |
| O << "broadcast"; |
| break; |
| case VPInstruction::BuildStructVector: |
| O << "buildstructvector"; |
| break; |
| case VPInstruction::BuildVector: |
| O << "buildvector"; |
| break; |
| case VPInstruction::ExtractLane: |
| O << "extract-lane"; |
| break; |
| case VPInstruction::ExtractLastElement: |
| O << "extract-last-element"; |
| break; |
| case VPInstruction::ExtractPenultimateElement: |
| O << "extract-penultimate-element"; |
| break; |
| case VPInstruction::ComputeAnyOfResult: |
| O << "compute-anyof-result"; |
| break; |
| case VPInstruction::ComputeFindIVResult: |
| O << "compute-find-iv-result"; |
| break; |
| case VPInstruction::ComputeReductionResult: |
| O << "compute-reduction-result"; |
| break; |
| case VPInstruction::LogicalAnd: |
| O << "logical-and"; |
| break; |
| case VPInstruction::PtrAdd: |
| O << "ptradd"; |
| break; |
| case VPInstruction::WidePtrAdd: |
| O << "wide-ptradd"; |
| break; |
| case VPInstruction::AnyOf: |
| O << "any-of"; |
| break; |
| case VPInstruction::FirstActiveLane: |
| O << "first-active-lane"; |
| break; |
| case VPInstruction::ReductionStartVector: |
| O << "reduction-start-vector"; |
| break; |
| case VPInstruction::ResumeForEpilogue: |
| O << "resume-for-epilogue"; |
| break; |
| default: |
| O << Instruction::getOpcodeName(getOpcode()); |
| } |
| |
| printFlags(O); |
| printOperands(O, SlotTracker); |
| |
| if (auto DL = getDebugLoc()) { |
| O << ", !dbg "; |
| DL.print(O); |
| } |
| } |
| #endif |
| |
| void VPInstructionWithType::execute(VPTransformState &State) { |
| State.setDebugLocFrom(getDebugLoc()); |
| if (isScalarCast()) { |
| Value *Op = State.get(getOperand(0), VPLane(0)); |
| Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()), |
| Op, ResultTy); |
| State.set(this, Cast, VPLane(0)); |
| return; |
| } |
| switch (getOpcode()) { |
| case VPInstruction::StepVector: { |
| Value *StepVector = |
| State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF)); |
| State.set(this, StepVector); |
| break; |
| } |
| case VPInstruction::VScale: { |
| Value *VScale = State.Builder.CreateVScale(ResultTy); |
| State.set(this, VScale, true); |
| break; |
| } |
| default: |
| llvm_unreachable("opcode not implemented yet"); |
| } |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| |
| switch (getOpcode()) { |
| case VPInstruction::WideIVStep: |
| O << "wide-iv-step "; |
| printOperands(O, SlotTracker); |
| break; |
| case VPInstruction::StepVector: |
| O << "step-vector " << *ResultTy; |
| break; |
| case VPInstruction::VScale: |
| O << "vscale " << *ResultTy; |
| break; |
| default: |
| assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); |
| O << Instruction::getOpcodeName(getOpcode()) << " "; |
| printOperands(O, SlotTracker); |
| O << " to " << *ResultTy; |
| } |
| } |
| #endif |
| |
| void VPPhi::execute(VPTransformState &State) { |
| State.setDebugLocFrom(getDebugLoc()); |
| PHINode *NewPhi = State.Builder.CreatePHI( |
| State.TypeAnalysis.inferScalarType(this), 2, getName()); |
| unsigned NumIncoming = getNumIncoming(); |
| if (getParent() != getParent()->getPlan()->getScalarPreheader()) { |
| // TODO: Fixup all incoming values of header phis once recipes defining them |
| // are introduced. |
| NumIncoming = 1; |
| } |
| for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { |
| Value *IncV = State.get(getIncomingValue(Idx), VPLane(0)); |
| BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx)); |
| NewPhi->addIncoming(IncV, PredBB); |
| } |
| State.set(this, NewPhi, VPLane(0)); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPPhi::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; |
| printAsOperand(O, SlotTracker); |
| O << " = phi "; |
| printPhiOperands(O, SlotTracker); |
| } |
| #endif |
| |
| VPIRInstruction *VPIRInstruction::create(Instruction &I) { |
| if (auto *Phi = dyn_cast<PHINode>(&I)) |
| return new VPIRPhi(*Phi); |
| return new VPIRInstruction(I); |
| } |
| |
| void VPIRInstruction::execute(VPTransformState &State) { |
| assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && |
| "PHINodes must be handled by VPIRPhi"); |
| // Advance the insert point after the wrapped IR instruction. This allows |
| // interleaving VPIRInstructions and other recipes. |
| State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); |
| } |
| |
| InstructionCost VPIRInstruction::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| // The recipe wraps an existing IR instruction on the border of VPlan's scope, |
| // hence it does not contribute to the cost-modeling for the VPlan. |
| return 0; |
| } |
| |
| void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { |
| assert(isa<PHINode>(getInstruction()) && |
| "can only update exiting operands to phi nodes"); |
| assert(getNumOperands() > 0 && "must have at least one operand"); |
| VPValue *Exiting = getOperand(0); |
| if (Exiting->isLiveIn()) |
| return; |
| |
| Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting}); |
| setOperand(0, Exiting); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "IR " << I; |
| } |
| #endif |
| |
| void VPIRPhi::execute(VPTransformState &State) { |
| PHINode *Phi = &getIRPhi(); |
| for (const auto &[Idx, Op] : enumerate(operands())) { |
| VPValue *ExitValue = Op; |
| auto Lane = vputils::isSingleScalar(ExitValue) |
| ? VPLane::getFirstLane() |
| : VPLane::getLastLaneForVF(State.VF); |
| VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; |
| auto *PredVPBB = Pred->getExitingBasicBlock(); |
| BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; |
| // Set insertion point in PredBB in case an extract needs to be generated. |
| // TODO: Model extracts explicitly. |
| State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); |
| Value *V = State.get(ExitValue, VPLane(Lane)); |
    // If the phi has no existing incoming value for PredBB, add a new
    // incoming value. Otherwise update the existing incoming value for PredBB.
| if (Phi->getBasicBlockIndex(PredBB) == -1) |
| Phi->addIncoming(V, PredBB); |
| else |
| Phi->setIncomingValueForBlock(PredBB, V); |
| } |
| |
| // Advance the insert point after the wrapped IR instruction. This allows |
| // interleaving VPIRInstructions and other recipes. |
| State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator())); |
| } |
| |
| void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { |
| VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && |
| "Number of phi operands must match number of predecessors"); |
| unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock); |
| R->removeOperand(Position); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPPhiAccessors::printPhiOperands(raw_ostream &O, |
| VPSlotTracker &SlotTracker) const { |
| interleaveComma(enumerate(getAsRecipe()->operands()), O, |
| [this, &O, &SlotTracker](auto Op) { |
| O << "[ "; |
| Op.value()->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getIncomingBlock(Op.index())->printAsOperand(O); |
| O << " ]"; |
| }); |
| } |
| #endif |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPIRPhi::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| VPIRInstruction::print(O, Indent, SlotTracker); |
| |
| if (getNumOperands() != 0) { |
| O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; |
| interleaveComma(incoming_values_and_blocks(), O, |
| [&O, &SlotTracker](auto Op) { |
| std::get<0>(Op)->printAsOperand(O, SlotTracker); |
| O << " from "; |
| std::get<1>(Op)->printAsOperand(O); |
| }); |
| O << ")"; |
| } |
| } |
| #endif |
| |
| VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) |
| : VPIRMetadata(I) { |
| if (!LVer || !isa<LoadInst, StoreInst>(&I)) |
| return; |
| const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I); |
| if (AliasScopeMD) |
| Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD); |
| if (NoAliasMD) |
| Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD); |
| } |
| |
| void VPIRMetadata::applyMetadata(Instruction &I) const { |
| for (const auto &[Kind, Node] : Metadata) |
| I.setMetadata(Kind, Node); |
| } |
| |
| void VPIRMetadata::intersect(const VPIRMetadata &Other) { |
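  // Keep only the (kind, node) pairs present in both sets; e.g. intersecting
  // {alias.scope: !1, noalias: !2} with {alias.scope: !1} yields
  // {alias.scope: !1} (illustrative).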
| SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection; |
| for (const auto &[KindA, MDA] : Metadata) { |
| for (const auto &[KindB, MDB] : Other.Metadata) { |
| if (KindA == KindB && MDA == MDB) { |
| MetadataIntersection.emplace_back(KindA, MDA); |
| break; |
| } |
| } |
| } |
| Metadata = std::move(MetadataIntersection); |
| } |
| |
| void VPWidenCallRecipe::execute(VPTransformState &State) { |
| assert(State.VF.isVector() && "not widening"); |
| assert(Variant != nullptr && "Can't create vector function."); |
| |
| FunctionType *VFTy = Variant->getFunctionType(); |
  // Collect the call arguments, using scalar values for parameters the vector
  // variant expects as scalars.
| SmallVector<Value *, 4> Args; |
| for (const auto &I : enumerate(args())) { |
| Value *Arg; |
| // Some vectorized function variants may also take a scalar argument, |
| // e.g. linear parameters for pointers. This needs to be the scalar value |
| // from the start of the respective part when interleaving. |
| if (!VFTy->getParamType(I.index())->isVectorTy()) |
| Arg = State.get(I.value(), VPLane(0)); |
| else |
| Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); |
| Args.push_back(Arg); |
| } |
| |
| auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); |
| SmallVector<OperandBundleDef, 1> OpBundles; |
| if (CI) |
| CI->getOperandBundlesAsDefs(OpBundles); |
| |
| CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); |
| applyFlags(*V); |
| applyMetadata(*V); |
| V->setCallingConv(Variant->getCallingConv()); |
| |
| if (!V->getType()->isVoidTy()) |
| State.set(this, V); |
| } |
| |
| InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), |
| Variant->getFunctionType()->params(), |
| Ctx.CostKind); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-CALL "; |
| |
| Function *CalledFn = getCalledScalarFunction(); |
| if (CalledFn->getReturnType()->isVoidTy()) |
| O << "void "; |
| else { |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| } |
| |
| O << "call"; |
| printFlags(O); |
| O << " @" << CalledFn->getName() << "("; |
| interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { |
| Op->printAsOperand(O, SlotTracker); |
| }); |
| O << ")"; |
| |
| O << " (using library function"; |
| if (Variant->hasName()) |
| O << ": " << Variant->getName(); |
| O << ")"; |
| } |
| #endif |
| |
| void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { |
| assert(State.VF.isVector() && "not widening"); |
| |
| SmallVector<Type *, 2> TysForDecl; |
| // Add return type if intrinsic is overloaded on it. |
| if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) |
| TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); |
| SmallVector<Value *, 4> Args; |
| for (const auto &I : enumerate(operands())) { |
| // Some intrinsics have a scalar argument - don't replace it with a |
| // vector. |
| Value *Arg; |
| if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), |
| State.TTI)) |
| Arg = State.get(I.value(), VPLane(0)); |
| else |
| Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); |
| if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), |
| State.TTI)) |
| TysForDecl.push_back(Arg->getType()); |
| Args.push_back(Arg); |
| } |
| |
| // Use vector version of the intrinsic. |
| Module *M = State.Builder.GetInsertBlock()->getModule(); |
| Function *VectorF = |
| Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); |
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsic.");
| |
| auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); |
| SmallVector<OperandBundleDef, 1> OpBundles; |
| if (CI) |
| CI->getOperandBundlesAsDefs(OpBundles); |
| |
| CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); |
| |
| applyFlags(*V); |
| applyMetadata(*V); |
| |
| if (!V->getType()->isVoidTy()) |
| State.set(this, V); |
| } |
| |
| /// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. |
| static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, |
| ArrayRef<const VPValue *> Operands, |
| const VPRecipeWithIRFlags &R, |
| ElementCount VF, |
| VPCostContext &Ctx) { |
| // Some backends analyze intrinsic arguments to determine cost. Use the |
| // underlying value for the operand if it has one. Otherwise try to use the |
| // operand of the underlying call instruction, if there is one. Otherwise |
| // clear Arguments. |
| // TODO: Rework TTI interface to be independent of concrete IR values. |
| SmallVector<const Value *> Arguments; |
| for (const auto &[Idx, Op] : enumerate(Operands)) { |
| auto *V = Op->getUnderlyingValue(); |
| if (!V) { |
| if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) { |
| Arguments.push_back(UI->getArgOperand(Idx)); |
| continue; |
| } |
| Arguments.clear(); |
| break; |
| } |
| Arguments.push_back(V); |
| } |
| |
| Type *ScalarRetTy = Ctx.Types.inferScalarType(&R); |
| Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy; |
| SmallVector<Type *> ParamTys; |
| for (const VPValue *Op : Operands) { |
| ParamTys.push_back(VF.isVector() |
| ? toVectorTy(Ctx.Types.inferScalarType(Op), VF) |
| : Ctx.Types.inferScalarType(Op)); |
| } |
| |
| // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. |
| FastMathFlags FMF = |
| R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags(); |
| IntrinsicCostAttributes CostAttrs( |
| ID, RetTy, Arguments, ParamTys, FMF, |
| dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()), |
| InstructionCost::getInvalid(), &Ctx.TLI); |
| return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); |
| } |
| |
| InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| SmallVector<const VPValue *> ArgOps(operands()); |
| return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); |
| } |
| |
| StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { |
| return Intrinsic::getBaseName(VectorIntrinsicID); |
| } |
| |
| bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { |
| assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); |
| return all_of(enumerate(operands()), [this, &Op](const auto &X) { |
| auto [Idx, V] = X; |
| return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), |
| Idx, nullptr); |
| }); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-INTRINSIC "; |
| if (ResultTy->isVoidTy()) { |
| O << "void "; |
| } else { |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| } |
| |
| O << "call"; |
| printFlags(O); |
| O << getIntrinsicName() << "("; |
| |
| interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { |
| Op->printAsOperand(O, SlotTracker); |
| }); |
| O << ")"; |
| } |
| #endif |
| |
| void VPHistogramRecipe::execute(VPTransformState &State) { |
| IRBuilderBase &Builder = State.Builder; |
| |
| Value *Address = State.get(getOperand(0)); |
| Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true); |
| VectorType *VTy = cast<VectorType>(Address->getType()); |
| |
| // The histogram intrinsic requires a mask even if the recipe doesn't; |
| // if the mask operand was omitted then all lanes should be executed and |
| // we just need to synthesize an all-true mask. |
| Value *Mask = nullptr; |
| if (VPValue *VPMask = getMask()) |
| Mask = State.get(VPMask); |
| else |
| Mask = |
| Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); |
| |
  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in the future, but for now we'll try this.
| if (Opcode == Instruction::Sub) |
| IncAmt = Builder.CreateNeg(IncAmt); |
| else |
| assert(Opcode == Instruction::Add && "only add or sub supported for now"); |
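  // Finally, emit the histogram update intrinsic, e.g. for VF=4 with i32
  // increments (illustrative):
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)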
| |
| State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, |
| {VTy, IncAmt->getType()}, |
| {Address, IncAmt, Mask}); |
| } |
| |
| InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| // FIXME: Take the gather and scatter into account as well. For now we're |
| // generating the same cost as the fallback path, but we'll likely |
| // need to create a new TTI method for determining the cost, including |
| // whether we can use base + vec-of-smaller-indices or just |
| // vec-of-pointers. |
| assert(VF.isVector() && "Invalid VF for histogram cost"); |
| Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); |
| VPValue *IncAmt = getOperand(1); |
| Type *IncTy = Ctx.Types.inferScalarType(IncAmt); |
| VectorType *VTy = VectorType::get(IncTy, VF); |
| |
| // Assume that a non-constant update value (or a constant != 1) requires |
| // a multiply, and add that into the cost. |
| InstructionCost MulCost = |
| Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind); |
| if (IncAmt->isLiveIn()) { |
| ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue()); |
| |
| if (CI && CI->getZExtValue() == 1) |
| MulCost = TTI::TCC_Free; |
| } |
| |
| // Find the cost of the histogram operation itself. |
| Type *PtrTy = VectorType::get(AddressTy, VF); |
| Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); |
| IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, |
| Type::getVoidTy(Ctx.LLVMCtx), |
| {PtrTy, IncTy, MaskTy}); |
| |
| // Add the costs together with the add/sub operation. |
| return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost + |
| Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-HISTOGRAM buckets: "; |
| getOperand(0)->printAsOperand(O, SlotTracker); |
| |
| if (Opcode == Instruction::Sub) |
| O << ", dec: "; |
| else { |
| assert(Opcode == Instruction::Add); |
| O << ", inc: "; |
| } |
| getOperand(1)->printAsOperand(O, SlotTracker); |
| |
| if (VPValue *Mask = getMask()) { |
| O << ", mask: "; |
| Mask->printAsOperand(O, SlotTracker); |
| } |
| } |
| |
| void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-SELECT "; |
| printAsOperand(O, SlotTracker); |
| O << " = select "; |
| printFlags(O); |
| getOperand(0)->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getOperand(1)->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getOperand(2)->printAsOperand(O, SlotTracker); |
| O << (isInvariantCond() ? " (condition is loop invariant)" : ""); |
| } |
| #endif |
| |
| void VPWidenSelectRecipe::execute(VPTransformState &State) { |
| // The condition can be loop invariant but still defined inside the |
| // loop. This means that we can't just use the original 'cond' value. |
| // We have to take the 'vectorized' value and pick the first lane. |
| // Instcombine will make this a no-op. |
| Value *Cond = State.get(getCond(), isInvariantCond()); |
| |
| Value *Op0 = State.get(getOperand(1)); |
| Value *Op1 = State.get(getOperand(2)); |
| Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); |
| State.set(this, Sel); |
| if (auto *I = dyn_cast<Instruction>(Sel)) { |
| if (isa<FPMathOperator>(I)) |
| applyFlags(*I); |
| applyMetadata(*I); |
| } |
| } |
| |
| InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); |
| bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); |
| Type *ScalarTy = Ctx.Types.inferScalarType(this); |
| Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); |
| |
| VPValue *Op0, *Op1; |
| using namespace llvm::VPlanPatternMatch; |
| if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && |
| (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || |
| match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { |
| // select x, y, false --> x & y |
| // select x, true, y --> x | y |
| const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); |
| const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); |
| |
| SmallVector<const Value *, 2> Operands; |
| if (all_of(operands(), |
| [](VPValue *Op) { return Op->getUnderlyingValue(); })) |
| Operands.append(SI->op_begin(), SI->op_end()); |
| bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); |
| return Ctx.TTI.getArithmeticInstrCost( |
| IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, |
| Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); |
| } |
| |
| Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); |
| if (!ScalarCond) |
| CondTy = VectorType::get(CondTy, VF); |
| |
| CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) |
| Pred = Cmp->getPredicate(); |
| return Ctx.TTI.getCmpSelInstrCost( |
| Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind, |
| {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); |
| } |
| |
| VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { |
| AllowReassoc = FMF.allowReassoc(); |
| NoNaNs = FMF.noNaNs(); |
| NoInfs = FMF.noInfs(); |
| NoSignedZeros = FMF.noSignedZeros(); |
| AllowReciprocal = FMF.allowReciprocal(); |
| AllowContract = FMF.allowContract(); |
| ApproxFunc = FMF.approxFunc(); |
| } |
| |
| #if !defined(NDEBUG) |
| bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { |
| switch (OpType) { |
| case OperationType::OverflowingBinOp: |
| return Opcode == Instruction::Add || Opcode == Instruction::Sub || |
| Opcode == Instruction::Mul || |
           Opcode == VPInstruction::CanonicalIVIncrementForPart;
| case OperationType::Trunc: |
| return Opcode == Instruction::Trunc; |
| case OperationType::DisjointOp: |
| return Opcode == Instruction::Or; |
| case OperationType::PossiblyExactOp: |
| return Opcode == Instruction::AShr; |
| case OperationType::GEPOp: |
| return Opcode == Instruction::GetElementPtr || |
| Opcode == VPInstruction::PtrAdd || |
| Opcode == VPInstruction::WidePtrAdd; |
| case OperationType::FPMathOp: |
| return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || |
| Opcode == Instruction::FSub || Opcode == Instruction::FNeg || |
| Opcode == Instruction::FDiv || Opcode == Instruction::FRem || |
| Opcode == Instruction::FCmp || Opcode == Instruction::Select || |
| Opcode == VPInstruction::WideIVStep || |
| Opcode == VPInstruction::ReductionStartVector || |
| Opcode == VPInstruction::ComputeReductionResult; |
  case OperationType::NonNegOp:
    return Opcode == Instruction::ZExt;
| case OperationType::Cmp: |
| return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; |
| case OperationType::Other: |
| return true; |
| } |
| llvm_unreachable("Unknown OperationType enum"); |
| } |
| #endif |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPIRFlags::printFlags(raw_ostream &O) const { |
| switch (OpType) { |
| case OperationType::Cmp: |
| O << " " << CmpInst::getPredicateName(getPredicate()); |
| break; |
| case OperationType::DisjointOp: |
| if (DisjointFlags.IsDisjoint) |
| O << " disjoint"; |
| break; |
| case OperationType::PossiblyExactOp: |
| if (ExactFlags.IsExact) |
| O << " exact"; |
| break; |
| case OperationType::OverflowingBinOp: |
| if (WrapFlags.HasNUW) |
| O << " nuw"; |
| if (WrapFlags.HasNSW) |
| O << " nsw"; |
| break; |
| case OperationType::Trunc: |
| if (TruncFlags.HasNUW) |
| O << " nuw"; |
| if (TruncFlags.HasNSW) |
| O << " nsw"; |
| break; |
| case OperationType::FPMathOp: |
| getFastMathFlags().print(O); |
| break; |
| case OperationType::GEPOp: |
| if (GEPFlags.isInBounds()) |
| O << " inbounds"; |
| else if (GEPFlags.hasNoUnsignedSignedWrap()) |
| O << " nusw"; |
| if (GEPFlags.hasNoUnsignedWrap()) |
| O << " nuw"; |
| break; |
| case OperationType::NonNegOp: |
| if (NonNegFlags.NonNeg) |
| O << " nneg"; |
| break; |
| case OperationType::Other: |
| break; |
| } |
| O << " "; |
| } |
| #endif |
| |
| void VPWidenRecipe::execute(VPTransformState &State) { |
| auto &Builder = State.Builder; |
| switch (Opcode) { |
| case Instruction::Call: |
| case Instruction::Br: |
| case Instruction::PHI: |
| case Instruction::GetElementPtr: |
| case Instruction::Select: |
| llvm_unreachable("This instruction is handled by a different recipe."); |
| case Instruction::UDiv: |
| case Instruction::SDiv: |
| case Instruction::SRem: |
| case Instruction::URem: |
| case Instruction::Add: |
| case Instruction::FAdd: |
| case Instruction::Sub: |
| case Instruction::FSub: |
| case Instruction::FNeg: |
| case Instruction::Mul: |
| case Instruction::FMul: |
| case Instruction::FDiv: |
| case Instruction::FRem: |
| case Instruction::Shl: |
| case Instruction::LShr: |
| case Instruction::AShr: |
| case Instruction::And: |
| case Instruction::Or: |
| case Instruction::Xor: { |
| // Just widen unops and binops. |
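    // E.g. for VF=4, a scalar `%r = add i32 %a, %b` is widened to
    // `%r.vec = add <4 x i32> %a.vec, %b.vec` (illustrative).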
| SmallVector<Value *, 2> Ops; |
| for (VPValue *VPOp : operands()) |
| Ops.push_back(State.get(VPOp)); |
| |
| Value *V = Builder.CreateNAryOp(Opcode, Ops); |
| |
| if (auto *VecOp = dyn_cast<Instruction>(V)) { |
| applyFlags(*VecOp); |
| applyMetadata(*VecOp); |
| } |
| |
| // Use this vector value for all users of the original instruction. |
| State.set(this, V); |
| break; |
| } |
| case Instruction::ExtractValue: { |
    assert(getNumOperands() == 2 && "expected single-level extractvalue");
| Value *Op = State.get(getOperand(0)); |
| auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); |
| Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue()); |
| State.set(this, Extract); |
| break; |
| } |
| case Instruction::Freeze: { |
| Value *Op = State.get(getOperand(0)); |
| Value *Freeze = Builder.CreateFreeze(Op); |
| State.set(this, Freeze); |
| break; |
| } |
| case Instruction::ICmp: |
| case Instruction::FCmp: { |
| // Widen compares. Generate vector compares. |
| bool FCmp = Opcode == Instruction::FCmp; |
| Value *A = State.get(getOperand(0)); |
| Value *B = State.get(getOperand(1)); |
| Value *C = nullptr; |
| if (FCmp) { |
| // Propagate fast math flags. |
| C = Builder.CreateFCmpFMF( |
| getPredicate(), A, B, |
| dyn_cast_or_null<Instruction>(getUnderlyingValue())); |
| } else { |
| C = Builder.CreateICmp(getPredicate(), A, B); |
| } |
| if (auto *I = dyn_cast<Instruction>(C)) |
| applyMetadata(*I); |
| State.set(this, C); |
| break; |
| } |
| default: |
| // This instruction is not vectorized by simple widening. |
| LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " |
| << Instruction::getOpcodeName(Opcode)); |
| llvm_unreachable("Unhandled instruction!"); |
| } // end of switch. |
| |
| #if !defined(NDEBUG) |
| // Verify that VPlan type inference results agree with the type of the |
| // generated values. |
| assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == |
| State.get(this)->getType() && |
| "inferred type and type from generated instructions do not match"); |
| #endif |
| } |
| |
| InstructionCost VPWidenRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| switch (Opcode) { |
| case Instruction::UDiv: |
| case Instruction::SDiv: |
| case Instruction::SRem: |
| case Instruction::URem: |
| // If the div/rem operation isn't safe to speculate and requires |
| // predication, then the only way we can even create a vplan is to insert |
| // a select on the second input operand to ensure we use the value of 1 |
| // for the inactive lanes. The select will be costed separately. |
| case Instruction::FNeg: |
| case Instruction::Add: |
| case Instruction::FAdd: |
| case Instruction::Sub: |
| case Instruction::FSub: |
| case Instruction::Mul: |
| case Instruction::FMul: |
| case Instruction::FDiv: |
| case Instruction::FRem: |
| case Instruction::Shl: |
| case Instruction::LShr: |
| case Instruction::AShr: |
| case Instruction::And: |
| case Instruction::Or: |
| case Instruction::Xor: |
| case Instruction::Freeze: |
| case Instruction::ExtractValue: |
| case Instruction::ICmp: |
| case Instruction::FCmp: |
| return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx); |
| default: |
| llvm_unreachable("Unsupported opcode for instruction"); |
| } |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN "; |
| printAsOperand(O, SlotTracker); |
| O << " = " << Instruction::getOpcodeName(Opcode); |
| printFlags(O); |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPWidenCastRecipe::execute(VPTransformState &State) { |
| auto &Builder = State.Builder; |
  // Vectorize casts.
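  // E.g. for VF=4, a scalar `sext i16 %x to i32` becomes
  // `sext <4 x i16> %x.vec to <4 x i32>` (illustrative).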
| assert(State.VF.isVector() && "Not vectorizing?"); |
| Type *DestTy = VectorType::get(getResultType(), State.VF); |
| VPValue *Op = getOperand(0); |
| Value *A = State.get(Op); |
| Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); |
| State.set(this, Cast); |
| if (auto *CastOp = dyn_cast<Instruction>(Cast)) { |
| applyFlags(*CastOp); |
| applyMetadata(*CastOp); |
| } |
| } |
| |
| InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| // TODO: In some cases, VPWidenCastRecipes are created but not considered in |
| // the legacy cost model, including truncates/extends when evaluating a |
| // reduction in a smaller type. |
| if (!getUnderlyingValue()) |
| return 0; |
  // Computes the CastContextHint for a recipe that may access memory.
| auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { |
| if (VF.isScalar()) |
| return TTI::CastContextHint::Normal; |
| if (isa<VPInterleaveBase>(R)) |
| return TTI::CastContextHint::Interleave; |
| if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) |
| return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked |
| : TTI::CastContextHint::Normal; |
| const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R); |
| if (WidenMemoryRecipe == nullptr) |
| return TTI::CastContextHint::None; |
| if (!WidenMemoryRecipe->isConsecutive()) |
| return TTI::CastContextHint::GatherScatter; |
| if (WidenMemoryRecipe->isReverse()) |
| return TTI::CastContextHint::Reversed; |
| if (WidenMemoryRecipe->isMasked()) |
| return TTI::CastContextHint::Masked; |
| return TTI::CastContextHint::Normal; |
| }; |
| |
| VPValue *Operand = getOperand(0); |
| TTI::CastContextHint CCH = TTI::CastContextHint::None; |
| // For Trunc/FPTrunc, get the context from the only user. |
| if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && |
| !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { |
| if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin())) |
| CCH = ComputeCCH(StoreRecipe); |
| } |
| // For Z/Sext, get the context from the operand. |
| else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
| Opcode == Instruction::FPExt) { |
| if (Operand->isLiveIn()) |
| CCH = TTI::CastContextHint::Normal; |
| else if (Operand->getDefiningRecipe()) |
| CCH = ComputeCCH(Operand->getDefiningRecipe()); |
| } |
| |
| auto *SrcTy = |
| cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); |
| auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF)); |
| // Arm TTI will use the underlying instruction to determine the cost. |
| return Ctx.TTI.getCastInstrCost( |
| Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, |
| dyn_cast_if_present<Instruction>(getUnderlyingValue())); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-CAST "; |
| printAsOperand(O, SlotTracker); |
| O << " = " << Instruction::getOpcodeName(Opcode); |
| printFlags(O); |
| printOperands(O, SlotTracker); |
| O << " to " << *getResultType(); |
| } |
| #endif |
| |
| InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); |
| } |
| |
/// A helper function that returns an integer or floating-point constant with
/// value \p C.
| static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
| return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) |
| : ConstantFP::get(Ty, C); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent; |
| printAsOperand(O, SlotTracker); |
| O << " = WIDEN-INDUCTION "; |
| printOperands(O, SlotTracker); |
| |
| if (auto *TI = getTruncInst()) |
| O << " (truncated to " << *TI->getType() << ")"; |
| } |
| #endif |
| |
| bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
| // The step may be defined by a recipe in the preheader (e.g. if it requires |
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as a live-in.
| if (getStepValue()->getDefiningRecipe()) |
| return false; |
| auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); |
| auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); |
| auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); |
| return StartC && StartC->isZero() && StepC && StepC->isOne() && |
| getScalarType() == CanIV->getScalarType(); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent; |
| printAsOperand(O, SlotTracker); |
| O << " = DERIVED-IV "; |
| getStartValue()->printAsOperand(O, SlotTracker); |
| O << " + "; |
| getOperand(1)->printAsOperand(O, SlotTracker); |
| O << " * "; |
| getStepValue()->printAsOperand(O, SlotTracker); |
| } |
| #endif |
| |
| void VPScalarIVStepsRecipe::execute(VPTransformState &State) { |
| // Fast-math-flags propagate from the original induction instruction. |
| IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
| if (hasFastMathFlags()) |
| State.Builder.setFastMathFlags(getFastMathFlags()); |
| |
  // Compute scalar induction steps, based on the scalar induction variable
  // (operand 0) and the step size (the step value operand).
| |
| Value *BaseIV = State.get(getOperand(0), VPLane(0)); |
| Value *Step = State.get(getStepValue(), VPLane(0)); |
| IRBuilderBase &Builder = State.Builder; |
| |
| // Ensure step has the same type as that of scalar IV. |
| Type *BaseIVTy = BaseIV->getType()->getScalarType(); |
| assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); |
| |
| // We build scalar steps for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (BaseIVTy->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = InductionOpcode; |
| MulOp = Instruction::FMul; |
| } |
| |
| // Determine the number of scalars we need to generate for each unroll |
| // iteration. |
| bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); |
| // Compute the scalar steps and save the results in State. |
| Type *IntStepTy = |
| IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); |
| Type *VecIVTy = nullptr; |
| Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
| if (!FirstLaneOnly && State.VF.isScalable()) { |
| VecIVTy = VectorType::get(BaseIVTy, State.VF); |
| UnitStepVec = |
| Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); |
| SplatStep = Builder.CreateVectorSplat(State.VF, Step); |
| SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); |
| } |
| |
| unsigned StartLane = 0; |
| unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
| if (State.Lane) { |
| StartLane = State.Lane->getKnownLane(); |
| EndLane = StartLane + 1; |
| } |
| Value *StartIdx0; |
| if (getUnrollPart(*this) == 0) |
| StartIdx0 = ConstantInt::get(IntStepTy, 0); |
| else { |
| StartIdx0 = State.get(getOperand(2), true); |
| if (getUnrollPart(*this) != 1) { |
| StartIdx0 = |
| Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(), |
| getUnrollPart(*this))); |
| } |
| StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy); |
| } |
| |
| if (!FirstLaneOnly && State.VF.isScalable()) { |
| auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); |
| auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); |
| if (BaseIVTy->isFloatingPointTy()) |
| InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); |
| auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); |
| auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); |
| State.set(this, Add); |
    // It is also useful to record the per-lane values for the known minimum
    // number of elements, so we do that below. This improves code quality
    // when extracting the first element, for example.
| } |
| |
| if (BaseIVTy->isFloatingPointTy()) |
| StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); |
| |
| for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { |
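    // Each lane's value is BaseIV + (StartIdx0 + Lane) * Step; e.g. lane 2 of
    // part 0 computes BaseIV + 2 * Step (illustrative).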
| Value *StartIdx = Builder.CreateBinOp( |
| AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); |
    // StartIdx is a runtime-evaluated value when VF is scalable. Otherwise,
    // it should be folded into a Constant.
| assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
| "Expected StartIdx to be folded to a constant when VF is not " |
| "scalable"); |
| auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); |
| auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); |
| State.set(this, Add, VPLane(Lane)); |
| } |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent; |
| printAsOperand(O, SlotTracker); |
| O << " = SCALAR-STEPS "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPWidenGEPRecipe::execute(VPTransformState &State) { |
| assert(State.VF.isVector() && "not widening"); |
| // Construct a vector GEP by widening the operands of the scalar GEP as |
| // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
| // results in a vector of pointers when at least one operand of the GEP |
| // is vector-typed. Thus, to keep the representation compact, we only use |
| // vector-typed operands for loop-varying values. |
| |
| if (areAllOperandsInvariant()) { |
| // If we are vectorizing, but the GEP has only loop-invariant operands, |
| // the GEP we build (by only using vector-typed operands for |
| // loop-varying values) would be a scalar pointer. Thus, to ensure we |
| // produce a vector of pointers, we need to either arbitrarily pick an |
| // operand to broadcast, or broadcast a clone of the original GEP. |
| // Here, we broadcast a clone of the original. |
| // |
| // TODO: If at some point we decide to scalarize instructions having |
| // loop-invariant operands, this special case will no longer be |
| // required. We would add the scalarization decision to |
| // collectLoopScalars() and teach getVectorValue() to broadcast |
| // the lane-zero scalar value. |
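    //
    // E.g. for VF=4 this emits one scalar GEP and splats it to <4 x ptr>
    // (illustrative):
    //   %g = getelementptr i32, ptr %base, i64 %idx
    //   ... broadcast %g to all 4 lanes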
| SmallVector<Value *> Ops; |
| for (unsigned I = 0, E = getNumOperands(); I != E; I++) |
| Ops.push_back(State.get(getOperand(I), VPLane(0))); |
| |
| auto *NewGEP = |
| State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops), |
| "", getGEPNoWrapFlags()); |
| Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); |
| State.set(this, Splat); |
| } else { |
| // If the GEP has at least one loop-varying operand, we are sure to |
| // produce a vector of pointers unless VF is scalar. |
| // The pointer operand of the new GEP. If it's loop-invariant, we |
| // won't broadcast it. |
| auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); |
| |
| // Collect all the indices for the new GEP. If any index is |
| // loop-invariant, we won't broadcast it. |
| SmallVector<Value *, 4> Indices; |
| for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
| VPValue *Operand = getOperand(I); |
| Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); |
| } |
| |
| // Create the new GEP. Note that this GEP may be a scalar if VF == 1, |
    // but it should be a vector otherwise.
| auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, |
| "", getGEPNoWrapFlags()); |
| assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && |
| "NewGEP is not a pointer vector"); |
| State.set(this, NewGEP); |
| } |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-GEP "; |
| O << (isPointerLoopInvariant() ? "Inv" : "Var"); |
| for (size_t I = 0; I < getNumOperands() - 1; ++I) |
| O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; |
| |
| O << " "; |
| printAsOperand(O, SlotTracker); |
| O << " = getelementptr"; |
| printFlags(O); |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, |
| unsigned CurrentPart, IRBuilderBase &Builder) { |
| // Use i32 for the gep index type when the value is constant, |
| // or query DataLayout for a more suitable index type otherwise. |
| const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); |
| return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0)) |
| ? DL.getIndexType(Builder.getPtrTy(0)) |
| : Builder.getInt32Ty(); |
| } |
| |
| void VPVectorEndPointerRecipe::execute(VPTransformState &State) { |
| auto &Builder = State.Builder; |
| unsigned CurrentPart = getUnrollPart(*this); |
| bool IsUnitStride = Stride == 1 || Stride == -1; |
| Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, |
| IsUnitStride, CurrentPart, Builder); |
| |
| // The wide store needs to start at the last vector element. |
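  // E.g. for VF=4, Stride=-1 and unroll part 1 (illustrative):
  //   NumElt   = -1 * 1 * 4 = -4
  //   LastLane = -1 * (4 - 1) = -3
  // so the result is gep(gep(%ptr, -4), -3).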
| Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); |
| if (IndexTy != RunTimeVF->getType()) |
| RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); |
| // NumElt = Stride * CurrentPart * RunTimeVF |
| Value *NumElt = Builder.CreateMul( |
| ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF); |
| // LastLane = Stride * (RunTimeVF - 1) |
| Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1)); |
| if (Stride != 1) |
| LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane); |
| Value *Ptr = State.get(getOperand(0), VPLane(0)); |
| Value *ResultPtr = |
| Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); |
| ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", |
| getGEPNoWrapFlags()); |
| |
| State.set(this, ResultPtr, /*IsScalar*/ true); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent; |
| printAsOperand(O, SlotTracker); |
| O << " = vector-end-pointer"; |
| printFlags(O); |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPVectorPointerRecipe::execute(VPTransformState &State) { |
| auto &Builder = State.Builder; |
| unsigned CurrentPart = getUnrollPart(*this); |
| Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, |
| /*IsUnitStride*/ true, CurrentPart, Builder); |
| Value *Ptr = State.get(getOperand(0), VPLane(0)); |
| |
| Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); |
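  // E.g. for a fixed VF=4 and unroll part 2, Increment is 8, i.e. the pointer
  // advances by two vector widths (illustrative).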
| Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment, |
| "", getGEPNoWrapFlags()); |
| |
| State.set(this, ResultPtr, /*IsScalar*/ true); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent; |
| printAsOperand(O, SlotTracker); |
| O << " = vector-pointer "; |
| |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| InstructionCost VPBlendRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| // Handle cases where only the first lane is used the same way as the legacy |
| // cost model. |
| if (vputils::onlyFirstLaneUsed(this)) |
| return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); |
| |
| Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); |
| Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); |
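  // A blend of N incoming values lowers to N - 1 selects; e.g. 3 incoming
  // values cost 2 selects (illustrative).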
| return (getNumIncomingValues() - 1) * |
| Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, |
| CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
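  // Example output (illustrative):
  //   BLEND vp<%p> = vp<%a> vp<%b>/vp<%m>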
| O << Indent << "BLEND "; |
| printAsOperand(O, SlotTracker); |
| O << " ="; |
| if (getNumIncomingValues() == 1) { |
| // Not a User of any mask: not really blending, this is a |
| // single-predecessor phi. |
| O << " "; |
| getIncomingValue(0)->printAsOperand(O, SlotTracker); |
| } else { |
| for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
| O << " "; |
| getIncomingValue(I)->printAsOperand(O, SlotTracker); |
| if (I == 0) |
| continue; |
| O << "/"; |
| getMask(I)->printAsOperand(O, SlotTracker); |
| } |
| } |
| } |
| #endif |
| |
| void VPReductionRecipe::execute(VPTransformState &State) { |
| assert(!State.Lane && "Reduction being replicated."); |
| Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); |
| RecurKind Kind = getRecurrenceKind(); |
| assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
| "In-loop AnyOf reductions aren't currently supported"); |
| // Propagate the fast-math flags carried by the underlying instruction. |
| IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| State.Builder.setFastMathFlags(getFastMathFlags()); |
| Value *NewVecOp = State.get(getVecOp()); |
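  // When a condition is present, masked-off lanes are replaced by the
  // reduction identity before reducing, e.g. for an add reduction with VF=4
  // (illustrative):
  //   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer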
| if (VPValue *Cond = getCondOp()) { |
| Value *NewCond = State.get(Cond, State.VF.isScalar()); |
| VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); |
| Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
| |
| Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags()); |
| if (State.VF.isVector()) |
| Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); |
| |
| Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); |
| NewVecOp = Select; |
| } |
| Value *NewRed; |
| Value *NextInChain; |
| if (IsOrdered) { |
| if (State.VF.isVector()) |
| NewRed = |
| createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain); |
| else |
| NewRed = State.Builder.CreateBinOp( |
| (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| PrevInChain, NewVecOp); |
| PrevInChain = NewRed; |
| NextInChain = NewRed; |
| } else { |
| PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); |
| NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind); |
| if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain); |
| else |
| NextInChain = State.Builder.CreateBinOp( |
| (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| PrevInChain, NewRed); |
| } |
| State.set(this, NextInChain, /*IsScalar*/ true); |
| } |
| |
| void VPReductionEVLRecipe::execute(VPTransformState &State) { |
| assert(!State.Lane && "Reduction being replicated."); |
| |
| auto &Builder = State.Builder; |
| // Propagate the fast-math flags carried by the underlying instruction. |
| IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); |
| Builder.setFastMathFlags(getFastMathFlags()); |
| |
| RecurKind Kind = getRecurrenceKind(); |
| Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); |
| Value *VecOp = State.get(getVecOp()); |
| Value *EVL = State.get(getEVL(), VPLane(0)); |
| |
| Value *Mask; |
| if (VPValue *CondOp = getCondOp()) |
| Mask = State.get(CondOp); |
| else |
| Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); |
| |
| Value *NewRed; |
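  // Both the ordered and unordered paths emit VP reduction intrinsics that
  // only operate on the first EVL lanes, e.g. for an unordered add reduction
  // with VF=4 (illustrative):
  //   %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vec,
  //                                           <4 x i1> %mask, i32 %evl)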
| if (isOrdered()) { |
| NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL); |
| } else { |
| NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL); |
| if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); |
| else |
| NewRed = Builder.CreateBinOp( |
| (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed, |
| Prev); |
| } |
| State.set(this, NewRed, /*IsScalar*/ true); |
| } |
| |
| InstructionCost VPReductionRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| RecurKind RdxKind = getRecurrenceKind(); |
| Type *ElementTy = Ctx.Types.inferScalarType(this); |
| auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); |
| unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); |
| FastMathFlags FMFs = getFastMathFlags(); |
| std::optional<FastMathFlags> OptionalFMF = |
| ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; |
| |
| // TODO: Support any-of reductions. |
| assert( |
| (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || |
| ForceTargetInstructionCost.getNumOccurrences() > 0) && |
| "Any-of reduction not implemented in VPlan-based cost model currently."); |
| |
  // Note that TTI should model the cost of moving the result to the scalar
  // register and the BinOp cost in getMinMaxReductionCost().
| if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { |
| Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); |
| return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); |
| } |
| |
  // Note that TTI should model the cost of moving the result to the scalar
  // register and the BinOp cost in getArithmeticReductionCost().
| return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, |
| Ctx.CostKind); |
| } |
| |
| VPExpressionRecipe::VPExpressionRecipe( |
| ExpressionTypes ExpressionType, |
| ArrayRef<VPSingleDefRecipe *> ExpressionRecipes) |
| : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), |
| ExpressionRecipes(SetVector<VPSingleDefRecipe *>( |
| ExpressionRecipes.begin(), ExpressionRecipes.end()) |
| .takeVector()), |
| ExpressionType(ExpressionType) { |
| assert(!ExpressionRecipes.empty() && "Nothing to combine?"); |
| assert( |
| none_of(ExpressionRecipes, |
| [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| "expression cannot contain recipes with side-effects"); |
| |
| // Maintain a copy of the expression recipes as a set of users. |
| SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers; |
| for (auto *R : ExpressionRecipes) |
| ExpressionRecipesAsSetOfUsers.insert(R); |
| |
| // Recipes in the expression, except the last one, must only be used by |
| // (other) recipes inside the expression. If there are other users, external |
| // to the expression, use a clone of the recipe for external users. |
| for (VPSingleDefRecipe *R : ExpressionRecipes) { |
| if (R != ExpressionRecipes.back() && |
| any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) { |
| return !ExpressionRecipesAsSetOfUsers.contains(U); |
| })) { |
      // There are users outside of the expression. Clone the recipe and use
      // the clone for those external users.
| VPSingleDefRecipe *CopyForExtUsers = R->clone(); |
| R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers]( |
| VPUser &U, unsigned) { |
| return !ExpressionRecipesAsSetOfUsers.contains(&U); |
| }); |
| CopyForExtUsers->insertBefore(R); |
| } |
| if (R->getParent()) |
| R->removeFromParent(); |
| } |
| |
| // Internalize all external operands to the expression recipes. To do so, |
| // create new temporary VPValues for all operands defined by a recipe outside |
| // the expression. The original operands are added as operands of the |
| // VPExpressionRecipe itself. |
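  // E.g. an expression {mul, reduce.add} where the mul reads vp<%a> and
  // vp<%b> defined outside ends up with operands [vp<%a>, vp<%b>], and the
  // mul reads two placeholder values instead (illustrative).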
| for (auto *R : ExpressionRecipes) { |
| for (const auto &[Idx, Op] : enumerate(R->operands())) { |
| auto *Def = Op->getDefiningRecipe(); |
| if (Def && ExpressionRecipesAsSetOfUsers.contains(Def)) |
| continue; |
| addOperand(Op); |
| LiveInPlaceholders.push_back(new VPValue()); |
| R->setOperand(Idx, LiveInPlaceholders.back()); |
| } |
| } |
| } |
| |
| void VPExpressionRecipe::decompose() { |
| for (auto *R : ExpressionRecipes) |
| R->insertBefore(this); |
| |
| for (const auto &[Idx, Op] : enumerate(operands())) |
| LiveInPlaceholders[Idx]->replaceAllUsesWith(Op); |
| |
| replaceAllUsesWith(ExpressionRecipes.back()); |
| ExpressionRecipes.clear(); |
| } |
| |
| InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| Type *RedTy = Ctx.Types.inferScalarType(this); |
| auto *SrcVecTy = cast<VectorType>( |
| toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)); |
| assert(RedTy->isIntegerTy() && |
| "VPExpressionRecipe only supports integer types currently."); |
| unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind()); |
| switch (ExpressionType) { |
| case ExpressionTypes::ExtendedReduction: { |
| return Ctx.TTI.getExtendedReductionCost( |
| Opcode, |
| cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == |
| Instruction::ZExt, |
| RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); |
| } |
| case ExpressionTypes::MulAccReduction: |
| return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, |
| Ctx.CostKind); |
| |
| case ExpressionTypes::ExtMulAccReduction: |
| return Ctx.TTI.getMulAccReductionCost( |
| cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == |
| Instruction::ZExt, |
| Opcode, RedTy, SrcVecTy, Ctx.CostKind); |
| } |
| llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); |
| } |
| |
| bool VPExpressionRecipe::mayReadOrWriteMemory() const { |
| return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) { |
| return R->mayReadFromMemory() || R->mayWriteToMemory(); |
| }); |
| } |
| |
| bool VPExpressionRecipe::mayHaveSideEffects() const { |
| assert( |
| none_of(ExpressionRecipes, |
| [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| "expression cannot contain recipes with side-effects"); |
| return false; |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| |
| void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EXPRESSION "; |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); |
| unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); |
| |
| switch (ExpressionType) { |
| case ExpressionTypes::ExtendedReduction: { |
| getOperand(1)->printAsOperand(O, SlotTracker); |
| O << " +"; |
| O << " reduce." << Instruction::getOpcodeName(Opcode) << " ("; |
| getOperand(0)->printAsOperand(O, SlotTracker); |
| Red->printFlags(O); |
| |
| auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| << *Ext0->getResultType(); |
| if (Red->isConditional()) { |
| O << ", "; |
| Red->getCondOp()->printAsOperand(O, SlotTracker); |
| } |
| O << ")"; |
| break; |
| } |
| case ExpressionTypes::MulAccReduction: |
| case ExpressionTypes::ExtMulAccReduction: { |
| getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| O << " + "; |
| O << "reduce." |
| << Instruction::getOpcodeName( |
| RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
| << " ("; |
| O << "mul"; |
| bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; |
| auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] |
| : ExpressionRecipes[0]); |
| Mul->printFlags(O); |
| if (IsExtended) |
| O << "("; |
| getOperand(0)->printAsOperand(O, SlotTracker); |
| if (IsExtended) { |
| auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| << *Ext0->getResultType() << "), ("; |
| } else { |
| O << ", "; |
| } |
| getOperand(1)->printAsOperand(O, SlotTracker); |
| if (IsExtended) { |
| auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
| O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
| << *Ext1->getResultType() << ")"; |
| } |
| if (Red->isConditional()) { |
| O << ", "; |
| Red->getCondOp()->printAsOperand(O, SlotTracker); |
| } |
| O << ")"; |
| break; |
| } |
| } |
| } |
| |
| void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "REDUCE "; |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| getChainOp()->printAsOperand(O, SlotTracker); |
| O << " +"; |
| printFlags(O); |
| O << " reduce." |
| << Instruction::getOpcodeName( |
| RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| << " ("; |
| getVecOp()->printAsOperand(O, SlotTracker); |
| if (isConditional()) { |
| O << ", "; |
| getCondOp()->printAsOperand(O, SlotTracker); |
| } |
| O << ")"; |
| } |
| |
| void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "REDUCE "; |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| getChainOp()->printAsOperand(O, SlotTracker); |
| O << " +"; |
| printFlags(O); |
| O << " vp.reduce." |
| << Instruction::getOpcodeName( |
| RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| << " ("; |
| getVecOp()->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getEVL()->printAsOperand(O, SlotTracker); |
| if (isConditional()) { |
| O << ", "; |
| getCondOp()->printAsOperand(O, SlotTracker); |
| } |
| O << ")"; |
| } |
| |
| #endif |
| |
| /// A helper function to scalarize a single Instruction in the innermost loop. |
| /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue |
| /// operands from \p RepRecipe instead of \p Instr's operands. |
| static void scalarizeInstruction(const Instruction *Instr, |
| VPReplicateRecipe *RepRecipe, |
| const VPLane &Lane, VPTransformState &State) { |
| assert((!Instr->getType()->isAggregateType() || |
| canVectorizeTy(Instr->getType())) && |
| "Expected vectorizable or non-aggregate type."); |
| |
  // Does this instruction return a value?
| bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
| |
| Instruction *Cloned = Instr->clone(); |
| if (!IsVoidRetTy) { |
| Cloned->setName(Instr->getName() + ".cloned"); |
| Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe); |
| // The operands of the replicate recipe may have been narrowed, resulting in |
| // a narrower result type. Update the type of the cloned instruction to the |
| // correct type. |
| if (ResultTy != Cloned->getType()) |
| Cloned->mutateType(ResultTy); |
| } |
| |
| RepRecipe->applyFlags(*Cloned); |
| RepRecipe->applyMetadata(*Cloned); |
| |
| if (RepRecipe->hasPredicate()) |
| cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate()); |
| |
| if (auto DL = RepRecipe->getDebugLoc()) |
| State.setDebugLocFrom(DL); |
| |
| // Replace the operands of the cloned instructions with their scalar |
| // equivalents in the new loop. |
| for (const auto &I : enumerate(RepRecipe->operands())) { |
| auto InputLane = Lane; |
| VPValue *Operand = I.value(); |
| if (vputils::isSingleScalar(Operand)) |
| InputLane = VPLane::getFirstLane(); |
| Cloned->setOperand(I.index(), State.get(Operand, InputLane)); |
| } |
| |
| // Place the cloned scalar in the new loop. |
| State.Builder.Insert(Cloned); |
| |
| State.set(RepRecipe, Cloned, Lane); |
| |
  // If we just cloned a new assumption, add it to the assumption cache.
| if (auto *II = dyn_cast<AssumeInst>(Cloned)) |
| State.AC->registerAssumption(II); |
| |
| assert( |
| (RepRecipe->getParent()->getParent() || |
| !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || |
| all_of(RepRecipe->operands(), |
| [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && |
| "Expected a recipe is either within a region or all of its operands " |
| "are defined outside the vectorized region."); |
| } |
| |
| void VPReplicateRecipe::execute(VPTransformState &State) { |
| Instruction *UI = getUnderlyingInstr(); |
| |
| if (!State.Lane) { |
| assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " |
| "must have already been unrolled"); |
| scalarizeInstruction(UI, this, VPLane(0), State); |
| return; |
| } |
| |
| assert((State.VF.isScalar() || !isSingleScalar()) && |
| "uniform recipe shouldn't be predicated"); |
| assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); |
| scalarizeInstruction(UI, this, *State.Lane, State); |
| // Insert scalar instance packing it into a vector. |
| if (State.VF.isVector() && shouldPack()) { |
| Value *WideValue = |
| State.Lane->isFirstLane() |
| ? PoisonValue::get(VectorType::get(UI->getType(), State.VF)) |
| : State.get(this); |
| State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, |
| *State.Lane)); |
| } |
| } |
| |
| bool VPReplicateRecipe::shouldPack() const { |
| // Find if the recipe is used by a widened recipe via an intervening |
| // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
| return any_of(users(), [](const VPUser *U) { |
| if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) |
| return !vputils::onlyScalarValuesUsed(PredR); |
| return false; |
| }); |
| } |
| |
| InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| Instruction *UI = cast<Instruction>(getUnderlyingValue()); |
| // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
| // transform; avoid computing its cost multiple times for now.
| Ctx.SkipCostComputation.insert(UI); |
| |
| switch (UI->getOpcode()) { |
| case Instruction::GetElementPtr: |
| // We mark this instruction as zero-cost because the cost of GEPs in |
| // vectorized code depends on whether the corresponding memory instruction |
| // is scalarized or not. Therefore, we handle GEPs with the memory |
| // instruction cost. |
| return 0; |
| case Instruction::Call: { |
| auto *CalledFn = |
| cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); |
| |
| SmallVector<const VPValue *> ArgOps(drop_end(operands())); |
| SmallVector<Type *, 4> Tys; |
| for (const VPValue *ArgOp : ArgOps) |
| Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); |
| |
| if (CalledFn->isIntrinsic()) |
| // Various zero-cost pseudo-intrinsics are scalarized rather than widened
| // via VPWidenIntrinsicRecipe. Return 0 for them early.
| switch (CalledFn->getIntrinsicID()) { |
| case Intrinsic::assume: |
| case Intrinsic::lifetime_end: |
| case Intrinsic::lifetime_start: |
| case Intrinsic::sideeffect: |
| case Intrinsic::pseudoprobe: |
| case Intrinsic::experimental_noalias_scope_decl: { |
| assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, |
| ElementCount::getFixed(1), Ctx) == 0 && |
| "scalarizing intrinsic should be free"); |
| return InstructionCost(0); |
| } |
| default: |
| break; |
| } |
| |
| Type *ResultTy = Ctx.Types.inferScalarType(this); |
| InstructionCost ScalarCallCost = |
| Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); |
| if (isSingleScalar()) { |
| if (CalledFn->isIntrinsic()) |
| ScalarCallCost = std::min( |
| ScalarCallCost, |
| getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, |
| ElementCount::getFixed(1), Ctx)); |
| return ScalarCallCost; |
| } |
| |
| if (VF.isScalable()) |
| return InstructionCost::getInvalid(); |
| |
| return ScalarCallCost * VF.getFixedValue() + |
| Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF); |
| } |
| case Instruction::Add: |
| case Instruction::Sub: |
| case Instruction::FAdd: |
| case Instruction::FSub: |
| case Instruction::Mul: |
| case Instruction::FMul: |
| case Instruction::FDiv: |
| case Instruction::FRem: |
| case Instruction::Shl: |
| case Instruction::LShr: |
| case Instruction::AShr: |
| case Instruction::And: |
| case Instruction::Or: |
| case Instruction::Xor: |
| case Instruction::ICmp: |
| case Instruction::FCmp: |
| return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), |
| Ctx) * |
| (isSingleScalar() ? 1 : VF.getFixedValue()); |
| case Instruction::SDiv: |
| case Instruction::UDiv: |
| case Instruction::SRem: |
| case Instruction::URem: { |
| InstructionCost ScalarCost = |
| getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx); |
| if (isSingleScalar()) |
| return ScalarCost; |
| |
| ScalarCost = ScalarCost * VF.getFixedValue() + |
| Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this), |
| to_vector(operands()), VF); |
| // If the recipe is not predicated (i.e. not in a replicate region), return |
| // the scalar cost. Otherwise handle predicated cost. |
| if (!getParent()->getParent()->isReplicator()) |
| return ScalarCost; |
| |
| // Account for the phi nodes that we will create. |
| ScalarCost += VF.getFixedValue() * |
| Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); |
| // Scale the cost by the probability of executing the predicated blocks. |
| // This assumes the predicated block for each vector lane is equally |
| // likely. |
| ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind); |
| return ScalarCost; |
| } |
| case Instruction::Load: |
| case Instruction::Store: { |
| if (isSingleScalar()) { |
| bool IsLoad = UI->getOpcode() == Instruction::Load; |
| Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); |
| Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); |
| const Align Alignment = getLoadStoreAlignment(UI); |
| unsigned AS = getLoadStoreAddressSpace(UI); |
| TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); |
| InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( |
| UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); |
| return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( |
| ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); |
| } |
| // TODO: See getMemInstScalarizationCost for how to handle replicating and |
| // predicated cases. |
| break; |
| } |
| } |
| |
| return Ctx.getLegacyCost(UI, VF); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE "); |
| |
| if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| } |
| if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
| O << "call"; |
| printFlags(O); |
| O << "@" << CB->getCalledFunction()->getName() << "("; |
| interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
| O, [&O, &SlotTracker](VPValue *Op) { |
| Op->printAsOperand(O, SlotTracker); |
| }); |
| O << ")"; |
| } else { |
| O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
| printFlags(O); |
| printOperands(O, SlotTracker); |
| } |
| |
| if (shouldPack()) |
| O << " (S->V)"; |
| } |
| #endif |
| |
| void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
| assert(State.Lane && "Branch on Mask works only on single instance."); |
| |
| VPValue *BlockInMask = getOperand(0); |
| Value *ConditionBit = State.get(BlockInMask, *State.Lane); |
| |
| // Replace the temporary unreachable terminator with a new conditional branch, |
| // whose two destinations will be set later when they are created. |
| auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
| assert(isa<UnreachableInst>(CurrentTerminator) && |
| "Expected to replace unreachable terminator with conditional branch."); |
| auto CondBr = |
| State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr); |
| CondBr->setSuccessor(0, nullptr); |
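| // Both successors are intentionally left unset here; they are wired up
| // later, once the corresponding basic blocks have been created.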
| CurrentTerminator->eraseFromParent(); |
| } |
| |
| InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| // The legacy cost model doesn't assign costs to branches for individual |
| // replicate regions. Match the current behavior in the VPlan cost model for |
| // now. |
| return 0; |
| } |
| |
| void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
| assert(State.Lane && "Predicated instruction PHI works per instance."); |
| Instruction *ScalarPredInst = |
| cast<Instruction>(State.get(getOperand(0), *State.Lane)); |
| BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
| BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
| assert(PredicatingBB && "Predicated block has no single predecessor."); |
| assert(isa<VPReplicateRecipe>(getOperand(0)) && |
| "operand must be VPReplicateRecipe"); |
| |
| // By current pack/unpack logic we need to generate only a single phi node: if |
| // a vector value for the predicated instruction exists at this point it means |
| // the instruction has vector users only, and a phi for the vector value is |
| // needed. In this case the recipe of the predicated instruction is marked to |
| // also do that packing, thereby "hoisting" the insert-element sequence. |
| // Otherwise, a phi node for the scalar value is needed. |
| if (State.hasVectorValue(getOperand(0))) { |
| Value *VectorValue = State.get(getOperand(0)); |
| InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); |
| PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); |
| VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. |
| VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. |
| if (State.hasVectorValue(this)) |
| State.reset(this, VPhi); |
| else |
| State.set(this, VPhi); |
| // NOTE: Currently we need to update the value of the operand, so the next |
| // predicated iteration inserts its generated value in the correct vector. |
| State.reset(getOperand(0), VPhi); |
| } else { |
| if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane()) |
| return; |
| |
| Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0)); |
| PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); |
| Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), |
| PredicatingBB); |
| Phi->addIncoming(ScalarPredInst, PredicatedBB); |
| if (State.hasScalarValue(this, *State.Lane)) |
| State.reset(this, Phi, *State.Lane); |
| else |
| State.set(this, Phi, *State.Lane); |
| // NOTE: Currently we need to update the value of the operand, so the next |
| // predicated iteration inserts its generated value in the correct vector. |
| State.reset(getOperand(0), Phi, *State.Lane); |
| } |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "PHI-PREDICATED-INSTRUCTION "; |
| printAsOperand(O, SlotTracker); |
| O << " = "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); |
| const Align Alignment = getLoadStoreAlignment(&Ingredient); |
| unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr())) |
| ->getAddressSpace(); |
| unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) |
| ? Instruction::Load |
| : Instruction::Store; |
| |
| if (!Consecutive) { |
| // TODO: Using the original IR may not be accurate. |
| // Currently, ARM will use the underlying IR to calculate gather/scatter |
| // instruction cost. |
| assert(!Reverse &&
| "Inconsecutive memory accesses should not be reversed.");
| |
| const Value *Ptr = getLoadStorePointerOperand(&Ingredient); |
| Type *PtrTy = Ptr->getType(); |
| |
| // If the address is uniform across all lanes, it can be computed with a
| // scalar type and then broadcast.
| if (!vputils::isSingleScalar(getAddr())) |
| PtrTy = toVectorTy(PtrTy, VF); |
| |
| return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, |
| Ctx.CostKind) + |
| Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, |
| Ctx.CostKind, &Ingredient); |
| } |
| |
| InstructionCost Cost = 0; |
| if (IsMasked) { |
| Cost += |
| Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); |
| } else { |
| TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( |
| isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0) |
| : getOperand(1)); |
| Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind, |
| OpInfo, &Ingredient); |
| } |
| if (!Reverse) |
| return Cost; |
| |
| return Cost + Ctx.TTI.getShuffleCost(
| TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
| cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
| } |
| |
| void VPWidenLoadRecipe::execute(VPTransformState &State) { |
| Type *ScalarDataTy = getLoadStoreType(&Ingredient); |
| auto *DataTy = VectorType::get(ScalarDataTy, State.VF); |
| const Align Alignment = getLoadStoreAlignment(&Ingredient); |
| bool CreateGather = !isConsecutive(); |
| |
| auto &Builder = State.Builder; |
| Value *Mask = nullptr; |
| if (auto *VPMask = getMask()) { |
| // Mask reversal is only needed for non-all-one masks: the all-one mask is
| // represented by a null VPValue, and the reverse of an all-one mask is
| // still all-one, so it needs no handling here.
| Mask = State.get(VPMask); |
| if (isReverse()) |
| Mask = Builder.CreateVectorReverse(Mask, "reverse"); |
| } |
| |
| Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); |
| Value *NewLI; |
| if (CreateGather) { |
| NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, |
| "wide.masked.gather"); |
| } else if (Mask) { |
| NewLI = |
| Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, |
| PoisonValue::get(DataTy), "wide.masked.load"); |
| } else { |
| NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); |
| } |
| applyMetadata(*cast<Instruction>(NewLI)); |
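| // For a reverse consecutive access, the wide load yields the lanes in
| // reverse of the scalar iteration order, so the result must be reversed,
| // e.g. <a3, a2, a1, a0> becoming <a0, a1, a2, a3> (illustrative).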
| if (Reverse) |
| NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); |
| State.set(this, NewLI); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN "; |
| printAsOperand(O, SlotTracker); |
| O << " = load "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| /// Use an all-true mask for the reverse rather than the actual mask, as this
| /// avoids a dependence without affecting the result.
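| /// For example (illustrative IR), reversing <vscale x 4 x i32> %v under %evl:
| ///   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
| ///     <vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true), i32 %evl)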
| static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, |
| Value *EVL, const Twine &Name) { |
| VectorType *ValTy = cast<VectorType>(Operand->getType()); |
| Value *AllTrueMask = |
| Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); |
| return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, |
| {Operand, AllTrueMask, EVL}, nullptr, Name); |
| } |
| |
| void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
| Type *ScalarDataTy = getLoadStoreType(&Ingredient); |
| auto *DataTy = VectorType::get(ScalarDataTy, State.VF); |
| const Align Alignment = getLoadStoreAlignment(&Ingredient); |
| bool CreateGather = !isConsecutive(); |
| |
| auto &Builder = State.Builder; |
| CallInst *NewLI; |
| Value *EVL = State.get(getEVL(), VPLane(0)); |
| Value *Addr = State.get(getAddr(), !CreateGather); |
| Value *Mask = nullptr; |
| if (VPValue *VPMask = getMask()) { |
| Mask = State.get(VPMask); |
| if (isReverse()) |
| Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); |
| } else { |
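| // The vp.* intrinsics require an explicit mask operand, so synthesize an
| // all-true mask when the recipe has none.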
| Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); |
| } |
| |
| if (CreateGather) { |
| NewLI = |
| Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, |
| nullptr, "wide.masked.gather"); |
| } else { |
| NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load, |
| {Addr, Mask, EVL}, nullptr, "vp.op.load"); |
| } |
| NewLI->addParamAttr( |
| 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); |
| applyMetadata(*NewLI); |
| Instruction *Res = NewLI; |
| if (isReverse()) |
| Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); |
| State.set(this, Res); |
| } |
| |
| InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| if (!Consecutive || IsMasked) |
| return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| |
| // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
| // because the EVL recipes use EVL to replace the tail mask, whereas the
| // legacy model always accounts for the cost of the mask.
| // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
| // longer need to compare against the legacy cost model.
| Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); |
| const Align Alignment = getLoadStoreAlignment(&Ingredient); |
| unsigned AS = getLoadStoreAddressSpace(&Ingredient); |
| InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
| Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); |
| if (!Reverse) |
| return Cost; |
| |
| return Cost + Ctx.TTI.getShuffleCost( |
| TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty), |
| cast<VectorType>(Ty), {}, Ctx.CostKind, 0); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN "; |
| printAsOperand(O, SlotTracker); |
| O << " = vp.load "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPWidenStoreRecipe::execute(VPTransformState &State) { |
| VPValue *StoredVPValue = getStoredValue(); |
| bool CreateScatter = !isConsecutive(); |
| const Align Alignment = getLoadStoreAlignment(&Ingredient); |
| |
| auto &Builder = State.Builder; |
| |
| Value *Mask = nullptr; |
| if (auto *VPMask = getMask()) { |
| // Mask reversal is only needed for non-all-one masks: the all-one mask is
| // represented by a null VPValue, and the reverse of an all-one mask is
| // still all-one, so it needs no handling here.
| Mask = State.get(VPMask); |
| if (isReverse()) |
| Mask = Builder.CreateVectorReverse(Mask, "reverse"); |
| } |
| |
| Value *StoredVal = State.get(StoredVPValue); |
| if (isReverse()) { |
| // If we store to reverse consecutive memory locations, then we need |
| // to reverse the order of elements in the stored value. |
| StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); |
| // We don't want to update the value in the map as it might be used in |
| // another expression. So don't call resetVectorValue(StoredVal). |
| } |
| Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); |
| Instruction *NewSI = nullptr; |
| if (CreateScatter) |
| NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); |
| else if (Mask) |
| NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); |
| else |
| NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); |
| applyMetadata(*NewSI); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN store "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { |
| VPValue *StoredValue = getStoredValue(); |
| bool CreateScatter = !isConsecutive(); |
| const Align Alignment = getLoadStoreAlignment(&Ingredient); |
| |
| auto &Builder = State.Builder; |
| |
| CallInst *NewSI = nullptr; |
| Value *StoredVal = State.get(StoredValue); |
| Value *EVL = State.get(getEVL(), VPLane(0)); |
| if (isReverse()) |
| StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); |
| Value *Mask = nullptr; |
| if (VPValue *VPMask = getMask()) { |
| Mask = State.get(VPMask); |
| if (isReverse()) |
| Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); |
| } else { |
| Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); |
| } |
| Value *Addr = State.get(getAddr(), !CreateScatter); |
| if (CreateScatter) { |
| NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), |
| Intrinsic::vp_scatter, |
| {StoredVal, Addr, Mask, EVL}); |
| } else { |
| NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), |
| Intrinsic::vp_store, |
| {StoredVal, Addr, Mask, EVL}); |
| } |
| NewSI->addParamAttr( |
| 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); |
| applyMetadata(*NewSI); |
| } |
| |
| InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| if (!Consecutive || IsMasked) |
| return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| |
| // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
| // because the EVL recipes use EVL to replace the tail mask, whereas the
| // legacy model always accounts for the cost of the mask.
| // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
| // longer need to compare against the legacy cost model.
| Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); |
| const Align Alignment = getLoadStoreAlignment(&Ingredient); |
| unsigned AS = getLoadStoreAddressSpace(&Ingredient); |
| InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
| Instruction::Store, Ty, Alignment, AS, Ctx.CostKind); |
| if (!Reverse) |
| return Cost; |
| |
| return Cost + Ctx.TTI.getShuffleCost( |
| TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty), |
| cast<VectorType>(Ty), {}, Ctx.CostKind, 0); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN vp.store "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
| VectorType *DstVTy, const DataLayout &DL) { |
| // Verify that V is a vector type with the same number of elements as DstVTy.
| auto VF = DstVTy->getElementCount(); |
| auto *SrcVecTy = cast<VectorType>(V->getType()); |
| assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); |
| Type *SrcElemTy = SrcVecTy->getElementType(); |
| Type *DstElemTy = DstVTy->getElementType(); |
| assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
| "Vector elements must have same size"); |
| |
| // Do a direct cast if element types are castable. |
| if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { |
| return Builder.CreateBitOrPointerCast(V, DstVTy); |
| } |
| // V cannot be cast directly to the desired vector type. This may happen
| // when V is a floating-point vector but DstVTy is a vector of pointers,
| // or vice-versa. Handle this with a two-step bitcast through an
| // intermediate integer type, i.e. Ptr <-> Int <-> Float.
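| // E.g. (illustrative, assuming 64-bit pointers): <4 x ptr> is cast to
| // <4 x i64> and then to <4 x double>.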
| assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
| "Only one type should be a pointer type"); |
| assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
| "Only one type should be a floating point type"); |
| Type *IntTy = |
| IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); |
| auto *VecIntTy = VectorType::get(IntTy, VF); |
| Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); |
| return Builder.CreateBitOrPointerCast(CastVal, DstVTy); |
| } |
| |
| /// Return a vector containing interleaved elements from multiple |
| /// smaller input vectors. |
| static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
| const Twine &Name) { |
| unsigned Factor = Vals.size(); |
| assert(Factor > 1 && "Tried to interleave invalid number of vectors"); |
| |
| VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); |
| #ifndef NDEBUG |
| for (Value *Val : Vals) |
| assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); |
| #endif |
| |
| // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
| // we must use intrinsics to interleave.
| if (VecTy->isScalableTy()) { |
| assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors"); |
| return Builder.CreateVectorInterleave(Vals, Name); |
| } |
| |
| // Fixed length. Start by concatenating all vectors into a wide vector. |
| Value *WideVec = concatenateVectors(Builder, Vals); |
| |
| // Interleave the elements into the wide vector. |
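| // E.g. for factor 2 with <4 x i32> inputs, the mask is <0,4,1,5,2,6,3,7>,
| // producing <a0,b0,a1,b1,a2,b2,a3,b3> (illustrative).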
| const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
| return Builder.CreateShuffleVector( |
| WideVec, createInterleaveMask(NumElts, Factor), Name); |
| } |
| |
| // Try to vectorize the interleave group that \p Instr belongs to. |
| // |
| // E.g. translate the following interleaved load group (factor = 3):
| // for (i = 0; i < N; i+=3) { |
| // R = Pic[i]; // Member of index 0 |
| // G = Pic[i+1]; // Member of index 1 |
| // B = Pic[i+2]; // Member of index 2 |
| // ... // do something to R, G, B |
| // } |
| // To: |
| // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| // |
| // Or translate the following interleaved store group (factor = 3):
| // for (i = 0; i < N; i+=3) { |
| // ... do something to R, G, B |
| // Pic[i] = R; // Member of index 0 |
| // Pic[i+1] = G; // Member of index 1 |
| // Pic[i+2] = B; // Member of index 2 |
| // } |
| // To: |
| // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| void VPInterleaveRecipe::execute(VPTransformState &State) { |
| assert(!State.Lane && "Interleave group being replicated."); |
| assert((!needsMaskForGaps() || !State.VF.isScalable()) && |
| "Masking gaps for scalable vectors is not yet supported."); |
| const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); |
| Instruction *Instr = Group->getInsertPos(); |
| |
| // Prepare for the vector type of the interleaved load/store. |
| Type *ScalarTy = getLoadStoreType(Instr); |
| unsigned InterleaveFactor = Group->getFactor(); |
| auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); |
| |
| VPValue *BlockInMask = getMask(); |
| VPValue *Addr = getAddr(); |
| Value *ResAddr = State.get(Addr, VPLane(0)); |
| |
| auto CreateGroupMask = [&BlockInMask, &State, |
| &InterleaveFactor](Value *MaskForGaps) -> Value * { |
| if (State.VF.isScalable()) { |
| assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); |
| assert(InterleaveFactor <= 8 && |
| "Unsupported deinterleave factor for scalable vectors"); |
| auto *ResBlockInMask = State.get(BlockInMask); |
| SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); |
| return interleaveVectors(State.Builder, Ops, "interleaved.mask"); |
| } |
| |
| if (!BlockInMask) |
| return MaskForGaps; |
| |
| Value *ResBlockInMask = State.get(BlockInMask); |
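| // Replicate each mask bit once per group member, e.g. for factor 3 and
| // VF 4 the replicated mask is <m0,m0,m0,m1,m1,m1,m2,m2,m2,m3,m3,m3>
| // (illustrative).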
| Value *ShuffledMask = State.Builder.CreateShuffleVector( |
| ResBlockInMask, |
| createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()), |
| "interleaved.mask"); |
| return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, |
| ShuffledMask, MaskForGaps) |
| : ShuffledMask; |
| }; |
| |
| const DataLayout &DL = Instr->getDataLayout(); |
| // Vectorize the interleaved load group. |
| if (isa<LoadInst>(Instr)) { |
| Value *MaskForGaps = nullptr; |
| if (needsMaskForGaps()) { |
| MaskForGaps = |
| createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group); |
| assert(MaskForGaps && "Mask for Gaps is required but it is null"); |
| } |
| |
| Instruction *NewLoad; |
| if (BlockInMask || MaskForGaps) { |
| Value *GroupMask = CreateGroupMask(MaskForGaps); |
| Value *PoisonVec = PoisonValue::get(VecTy); |
| NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, |
| Group->getAlign(), GroupMask, |
| PoisonVec, "wide.masked.vec"); |
| } else |
| NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, |
| Group->getAlign(), "wide.vec"); |
| applyMetadata(*NewLoad); |
| // TODO: Also manage existing metadata using VPIRMetadata. |
| Group->addMetadata(NewLoad); |
| |
| ArrayRef<VPValue *> VPDefs = definedValues(); |
| if (VecTy->isScalableTy()) { |
| // Scalable vectors cannot use arbitrary shufflevectors (only splats),
| // so we must use intrinsics to deinterleave.
| assert(InterleaveFactor <= 8 && |
| "Unsupported deinterleave factor for scalable vectors"); |
| NewLoad = State.Builder.CreateIntrinsic( |
| Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), |
| NewLoad->getType(), NewLoad, |
| /*FMFSource=*/nullptr, "strided.vec"); |
| } |
| |
| auto CreateStridedVector = [&InterleaveFactor, &State, |
| &NewLoad](unsigned Index) -> Value * { |
| assert(Index < InterleaveFactor && "Illegal group index"); |
| if (State.VF.isScalable()) |
| return State.Builder.CreateExtractValue(NewLoad, Index); |
| |
| // For fixed length VF, use shuffle to extract the sub-vectors from the |
| // wide load. |
| auto StrideMask = |
| createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue()); |
| return State.Builder.CreateShuffleVector(NewLoad, StrideMask, |
| "strided.vec"); |
| }; |
| |
| for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| Instruction *Member = Group->getMember(I); |
| |
| // Skip the gaps in the group. |
| if (!Member) |
| continue; |
| |
| Value *StridedVec = CreateStridedVector(I); |
| |
| // If this member has a different type, cast the result type.
| if (Member->getType() != ScalarTy) { |
| VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); |
| StridedVec = |
| createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); |
| } |
| |
| if (Group->isReverse()) |
| StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); |
| |
| State.set(VPDefs[J], StridedVec); |
| ++J; |
| } |
| return; |
| } |
| |
| // The sub-vector type for the current instruction.
| auto *SubVT = VectorType::get(ScalarTy, State.VF); |
| |
| // Vectorize the interleaved store group. |
| Value *MaskForGaps = |
| createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); |
| assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && |
| "Mismatch between NeedsMaskForGaps and MaskForGaps"); |
| ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| // Collect the stored vector from each member. |
| SmallVector<Value *, 4> StoredVecs; |
| unsigned StoredIdx = 0; |
| for (unsigned i = 0; i < InterleaveFactor; i++) { |
| assert((Group->getMember(i) || MaskForGaps) &&
| "Failed to get a member from an interleaved store group");
| Instruction *Member = Group->getMember(i); |
| |
| // Skip the gaps in the group. |
| if (!Member) { |
| StoredVecs.push_back(PoisonValue::get(SubVT));
| continue; |
| } |
| |
| Value *StoredVec = State.get(StoredValues[StoredIdx]); |
| ++StoredIdx; |
| |
| if (Group->isReverse()) |
| StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); |
| |
| // If this member has a different type, cast it to a unified type.
| if (StoredVec->getType() != SubVT)
| StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); |
| |
| StoredVecs.push_back(StoredVec); |
| } |
| |
| // Interleave all the smaller vectors into one wider vector. |
| Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); |
| Instruction *NewStoreInstr; |
| if (BlockInMask || MaskForGaps) { |
| Value *GroupMask = CreateGroupMask(MaskForGaps); |
| NewStoreInstr = State.Builder.CreateMaskedStore( |
| IVec, ResAddr, Group->getAlign(), GroupMask); |
| } else |
| NewStoreInstr = |
| State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign()); |
| |
| applyMetadata(*NewStoreInstr); |
| // TODO: Also manage existing metadata using VPIRMetadata. |
| Group->addMetadata(NewStoreInstr); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; |
| IG->getInsertPos()->printAsOperand(O, false); |
| O << ", "; |
| getAddr()->printAsOperand(O, SlotTracker); |
| VPValue *Mask = getMask(); |
| if (Mask) { |
| O << ", "; |
| Mask->printAsOperand(O, SlotTracker); |
| } |
| |
| unsigned OpIdx = 0; |
| for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| if (!IG->getMember(i)) |
| continue; |
| if (getNumStoreOperands() > 0) { |
| O << "\n" << Indent << " store "; |
| getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
| O << " to index " << i; |
| } else { |
| O << "\n" << Indent << " "; |
| getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| O << " = load from index " << i; |
| } |
| ++OpIdx; |
| } |
| } |
| #endif |
| |
| void VPInterleaveEVLRecipe::execute(VPTransformState &State) { |
| assert(!State.Lane && "Interleave group being replicated."); |
| assert(State.VF.isScalable() && |
| "Only support scalable VF for EVL tail-folding."); |
| assert(!needsMaskForGaps() && |
| "Masking gaps for scalable vectors is not yet supported."); |
| const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); |
| Instruction *Instr = Group->getInsertPos(); |
| |
| // Prepare for the vector type of the interleaved load/store. |
| Type *ScalarTy = getLoadStoreType(Instr); |
| unsigned InterleaveFactor = Group->getFactor(); |
| assert(InterleaveFactor <= 8 && |
| "Unsupported deinterleave/interleave factor for scalable vectors"); |
| ElementCount WideVF = State.VF * InterleaveFactor; |
| auto *VecTy = VectorType::get(ScalarTy, WideVF); |
| |
| VPValue *Addr = getAddr(); |
| Value *ResAddr = State.get(Addr, VPLane(0)); |
| Value *EVL = State.get(getEVL(), VPLane(0)); |
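| // Each active lane covers InterleaveFactor consecutive elements, so the
| // wide vp.load/vp.store operates on an effective length of EVL * Factor.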
| Value *InterleaveEVL = State.Builder.CreateMul( |
| EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl", |
| /* NUW= */ true, /* NSW= */ true); |
| LLVMContext &Ctx = State.Builder.getContext(); |
| |
| Value *GroupMask = nullptr; |
| if (VPValue *BlockInMask = getMask()) { |
| SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask)); |
| GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask"); |
| } else { |
| GroupMask = |
| State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue()); |
| } |
| |
| // Vectorize the interleaved load group. |
| if (isa<LoadInst>(Instr)) { |
| CallInst *NewLoad = State.Builder.CreateIntrinsic( |
| VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr, |
| "wide.vp.load"); |
| NewLoad->addParamAttr(0, |
| Attribute::getWithAlignment(Ctx, Group->getAlign())); |
| |
| applyMetadata(*NewLoad); |
| // TODO: Also manage existing metadata using VPIRMetadata. |
| Group->addMetadata(NewLoad); |
| |
| // Scalable vectors cannot use arbitrary shufflevectors (only splats),
| // so we must use intrinsics to deinterleave.
| NewLoad = State.Builder.CreateIntrinsic( |
| Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), |
| NewLoad->getType(), NewLoad, |
| /*FMFSource=*/nullptr, "strided.vec"); |
| |
| const DataLayout &DL = Instr->getDataLayout(); |
| for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| Instruction *Member = Group->getMember(I); |
| // Skip the gaps in the group. |
| if (!Member) |
| continue; |
| |
| Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I); |
| // If this member has a different type, cast the result type.
| if (Member->getType() != ScalarTy) { |
| VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); |
| StridedVec = |
| createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); |
| } |
| |
| State.set(getVPValue(J), StridedVec); |
| ++J; |
| } |
| return; |
| } // End of the interleaved-load case.
| |
| // The sub-vector type for the current instruction.
| auto *SubVT = VectorType::get(ScalarTy, State.VF); |
| // Vectorize the interleaved store group. |
| ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| // Collect the stored vector from each member. |
| SmallVector<Value *, 4> StoredVecs; |
| const DataLayout &DL = Instr->getDataLayout(); |
| for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { |
| Instruction *Member = Group->getMember(I); |
| // Skip the gaps in the group. |
| if (!Member) { |
| StoredVecs.push_back(PoisonValue::get(SubVT)); |
| continue; |
| } |
| |
| Value *StoredVec = State.get(StoredValues[StoredIdx]); |
| // If this member has a different type, cast it to a unified type.
| if (StoredVec->getType() != SubVT) |
| StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); |
| |
| StoredVecs.push_back(StoredVec); |
| ++StoredIdx; |
| } |
| |
| // Interleave all the smaller vectors into one wider vector. |
| Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); |
| CallInst *NewStore = |
| State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store, |
| {IVec, ResAddr, GroupMask, InterleaveEVL}); |
| NewStore->addParamAttr(1, |
| Attribute::getWithAlignment(Ctx, Group->getAlign())); |
| |
| applyMetadata(*NewStore); |
| // TODO: Also manage existing metadata using VPIRMetadata. |
| Group->addMetadata(NewStore); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; |
| IG->getInsertPos()->printAsOperand(O, false); |
| O << ", "; |
| getAddr()->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getEVL()->printAsOperand(O, SlotTracker); |
| if (VPValue *Mask = getMask()) { |
| O << ", "; |
| Mask->printAsOperand(O, SlotTracker); |
| } |
| |
| unsigned OpIdx = 0; |
| for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| if (!IG->getMember(i)) |
| continue; |
| if (getNumStoreOperands() > 0) { |
| O << "\n" << Indent << " vp.store "; |
| getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); |
| O << " to index " << i; |
| } else { |
| O << "\n" << Indent << " "; |
| getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| O << " = vp.load from index " << i; |
| } |
| ++OpIdx; |
| } |
| } |
| #endif |
| |
| InstructionCost VPInterleaveBase::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| Instruction *InsertPos = getInsertPos(); |
| // Find the VPValue index of the interleave group. We need to skip gaps. |
| unsigned InsertPosIdx = 0; |
| for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
| if (auto *Member = IG->getMember(Idx)) { |
| if (Member == InsertPos) |
| break; |
| InsertPosIdx++; |
| } |
| Type *ValTy = Ctx.Types.inferScalarType( |
| getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx) |
| : getStoredValues()[InsertPosIdx]); |
| auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); |
| unsigned AS = getLoadStoreAddressSpace(InsertPos); |
| |
| unsigned InterleaveFactor = IG->getFactor(); |
| auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); |
| |
| // Holds the indices of existing members in the interleaved group. |
| SmallVector<unsigned, 4> Indices; |
| for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
| if (IG->getMember(IF)) |
| Indices.push_back(IF); |
| |
| // Calculate the cost of the whole interleaved group. |
| InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( |
| InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, |
| IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps); |
| |
| if (!IG->isReverse()) |
| return Cost; |
| |
| return Cost + IG->getNumMembers() * |
| Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, |
| VectorTy, VectorTy, {}, Ctx.CostKind, |
| 0); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EMIT "; |
| printAsOperand(O, SlotTracker); |
| O << " = CANONICAL-INDUCTION "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
| return IsScalarAfterVectorization && |
| (!IsScalable || vputils::onlyFirstLaneUsed(this)); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| assert((getNumOperands() == 3 || getNumOperands() == 5) && |
| "unexpected number of operands"); |
| O << Indent << "EMIT "; |
| printAsOperand(O, SlotTracker); |
| O << " = WIDEN-POINTER-INDUCTION "; |
| getStartValue()->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getStepValue()->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getOperand(2)->printAsOperand(O, SlotTracker); |
| if (getNumOperands() == 5) { |
| O << ", "; |
| getOperand(3)->printAsOperand(O, SlotTracker); |
| O << ", "; |
| getOperand(4)->printAsOperand(O, SlotTracker); |
| } |
| } |
| |
| void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EMIT "; |
| printAsOperand(O, SlotTracker); |
| O << " = EXPAND SCEV " << *Expr; |
| } |
| #endif |
| |
| void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { |
| Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); |
| Type *STy = CanonicalIV->getType(); |
| IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); |
| ElementCount VF = State.VF; |
| Value *VStart = VF.isScalar() |
| ? CanonicalIV |
| : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); |
| Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); |
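| // E.g. for VF 4 and unroll part 1 (illustrative): VStep starts as the
| // scalar 4, is splat to <4, 4, 4, 4>, and becomes <4, 5, 6, 7> after
| // adding the step vector <0, 1, 2, 3>.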
| if (VF.isVector()) { |
| VStep = Builder.CreateVectorSplat(VF, VStep); |
| VStep = |
| Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); |
| } |
| Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); |
| State.set(this, CanonicalVectorIV); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EMIT "; |
| printAsOperand(O, SlotTracker); |
| O << " = WIDEN-CANONICAL-INDUCTION "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
| auto &Builder = State.Builder; |
| // Create a vector from the initial value. |
| auto *VectorInit = getStartValue()->getLiveInIRValue(); |
| |
| Type *VecTy = State.VF.isScalar() |
| ? VectorInit->getType() |
| : VectorType::get(VectorInit->getType(), State.VF); |
| |
| BasicBlock *VectorPH = |
| State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); |
| if (State.VF.isVector()) { |
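| // Only the last lane of the initial value is live across the loop
| // boundary: insert it into the last lane of a poison vector, e.g.
| // <poison, poison, poison, init> for VF 4 (illustrative).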
| auto *IdxTy = Builder.getInt32Ty(); |
| auto *One = ConstantInt::get(IdxTy, 1); |
| IRBuilder<>::InsertPointGuard Guard(Builder); |
| Builder.SetInsertPoint(VectorPH->getTerminator()); |
| auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); |
| auto *LastIdx = Builder.CreateSub(RuntimeVF, One); |
| VectorInit = Builder.CreateInsertElement( |
| PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); |
| } |
| |
| // Create a phi node for the new recurrence. |
| PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur"); |
| Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); |
| Phi->addIncoming(VectorInit, VectorPH); |
| State.set(this, Phi); |
| } |
| |
| InstructionCost |
| VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, |
| VPCostContext &Ctx) const { |
| if (VF.isScalar()) |
| return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); |
| |
| return 0; |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; |
| printAsOperand(O, SlotTracker); |
| O << " = phi "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| void VPReductionPHIRecipe::execute(VPTransformState &State) { |
| // Reductions do not have to start at zero. They can start with |
| // any loop invariant values. |
| VPValue *StartVPV = getStartValue(); |
| |
| // In order to support recurrences we need to be able to vectorize Phi nodes. |
| // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
| // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
| // this value when we vectorize all of the instructions that use the PHI. |
| BasicBlock *VectorPH = |
| State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); |
| bool ScalarPHI = State.VF.isScalar() || IsInLoop; |
| Value *StartV = State.get(StartVPV, ScalarPHI); |
| Type *VecTy = StartV->getType(); |
| |
| BasicBlock *HeaderBB = State.CFG.PrevBB; |
| assert(State.CurrentParentLoop->getHeader() == HeaderBB && |
| "recipe must be in the vector loop header"); |
| auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); |
| Phi->insertBefore(HeaderBB->getFirstInsertionPt()); |
| State.set(this, Phi, IsInLoop); |
| |
| Phi->addIncoming(StartV, VectorPH); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-REDUCTION-PHI "; |
| |
| printAsOperand(O, SlotTracker); |
| O << " = phi "; |
| printOperands(O, SlotTracker); |
| if (VFScaleFactor != 1) |
| O << " (VF scaled by 1/" << VFScaleFactor << ")"; |
| } |
| #endif |
| |
| void VPWidenPHIRecipe::execute(VPTransformState &State) { |
| Value *Op0 = State.get(getOperand(0)); |
| Type *VecTy = Op0->getType(); |
| Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); |
| State.set(this, VecPhi); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "WIDEN-PHI "; |
| |
| printAsOperand(O, SlotTracker); |
| O << " = phi "; |
| printPhiOperands(O, SlotTracker); |
| } |
| #endif |
| |
| // TODO: It would be good to use the existing VPWidenPHIRecipe instead and |
| // remove VPActiveLaneMaskPHIRecipe. |
| void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
| BasicBlock *VectorPH = |
| State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); |
| Value *StartMask = State.get(getOperand(0)); |
| PHINode *Phi = |
| State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); |
| Phi->addIncoming(StartMask, VectorPH); |
| State.set(this, Phi); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "ACTIVE-LANE-MASK-PHI "; |
| |
| printAsOperand(O, SlotTracker); |
| O << " = phi "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| VPSlotTracker &SlotTracker) const { |
| O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; |
| |
| printAsOperand(O, SlotTracker); |
| O << " = phi "; |
| printOperands(O, SlotTracker); |
| } |
| #endif |