| //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| /// \file |
| /// This file provides a helper that implements much of the TTI interface in |
| /// terms of the target-independent code generator and TargetLowering |
| /// interfaces. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef LLVM_CODEGEN_BASICTTIIMPL_H |
| #define LLVM_CODEGEN_BASICTTIIMPL_H |
| |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/TargetTransformInfoImpl.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Target/TargetLowering.h" |
| #include "llvm/Target/TargetSubtargetInfo.h" |
| #include "llvm/Analysis/TargetLibraryInfo.h" |
| |
| namespace llvm { |
| |
| extern cl::opt<unsigned> PartialUnrollingThreshold; |
| |
| /// \brief Base class which can be used to help build a TTI implementation. |
| /// |
| /// This class provides as much implementation of the TTI interface as is |
| /// possible using the target independent parts of the code generator. |
| /// |
| /// In order to subclass it, your class must implement a getST() method to |
| /// return the subtarget, and a getTLI() method to return the target lowering. |
| /// We need these methods implemented in the derived class so that this class |
| /// doesn't have to duplicate storage for them. |
| template <typename T> |
| class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { |
| private: |
| typedef TargetTransformInfoImplCRTPBase<T> BaseT; |
| typedef TargetTransformInfo TTI; |
| |
| /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
| /// are set if the result needs to be inserted and/or extracted from vectors. |
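| /// For example, with both Insert and Extract set, scalarizing a <4 x i32> |
| /// is estimated as four InsertElement plus four ExtractElement operations, |
| /// i.e. 8 with this class's default per-element cost of 1. |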
| unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { |
| assert(Ty->isVectorTy() && "Can only scalarize vectors"); |
| unsigned Cost = 0; |
| |
| for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { |
| if (Insert) |
| Cost += static_cast<T *>(this) |
| ->getVectorInstrCost(Instruction::InsertElement, Ty, i); |
| if (Extract) |
| Cost += static_cast<T *>(this) |
| ->getVectorInstrCost(Instruction::ExtractElement, Ty, i); |
| } |
| |
| return Cost; |
| } |
| |
| /// Estimate the cost overhead of an SK_Alternate shuffle. |
| unsigned getAltShuffleOverhead(Type *Ty) { |
| assert(Ty->isVectorTy() && "Can only shuffle vectors"); |
| unsigned Cost = 0; |
| // Shuffle cost is equal to the cost of extracting an element from its |
| // source vector plus the cost of inserting it into the result vector. |
| |
| // E.g. a <4 x float> shuffle with a mask of <0,5,2,7> extracts from index 0 |
| // of the first vector, index 1 of the second vector, index 2 of the first |
| // vector, and index 3 of the second vector, and inserts them at indices |
| // <0,1,2,3> of the result vector. |
| for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { |
| Cost += static_cast<T *>(this) |
| ->getVectorInstrCost(Instruction::InsertElement, Ty, i); |
| Cost += static_cast<T *>(this) |
| ->getVectorInstrCost(Instruction::ExtractElement, Ty, i); |
| } |
| return Cost; |
| } |
| |
| /// \brief Local query method delegates up to T which *must* implement this! |
| const TargetSubtargetInfo *getST() const { |
| return static_cast<const T *>(this)->getST(); |
| } |
| |
| /// \brief Local query method delegates up to T which *must* implement this! |
| const TargetLoweringBase *getTLI() const { |
| return static_cast<const T *>(this)->getTLI(); |
| } |
| |
| protected: |
| explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) |
| : BaseT(DL) {} |
| |
| using TargetTransformInfoImplBase::DL; |
| |
| public: |
| // Provide value semantics. MSVC requires that we spell all of these out. |
| BasicTTIImplBase(const BasicTTIImplBase &Arg) |
| : BaseT(static_cast<const BaseT &>(Arg)) {} |
| BasicTTIImplBase(BasicTTIImplBase &&Arg) |
| : BaseT(std::move(static_cast<BaseT &>(Arg))) {} |
| |
| /// \name Scalar TTI Implementations |
| /// @{ |
| |
| bool hasBranchDivergence() { return false; } |
| |
| bool isSourceOfDivergence(const Value *V) { return false; } |
| |
| bool isLegalAddImmediate(int64_t imm) { |
| return getTLI()->isLegalAddImmediate(imm); |
| } |
| |
| bool isLegalICmpImmediate(int64_t imm) { |
| return getTLI()->isLegalICmpImmediate(imm); |
| } |
| |
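| // Packages the query into a TargetLoweringBase::AddrMode and asks the |
| // target whether that combination is natively addressable. For example |
| // (hypothetically, with x86-style addressing), BaseGV == nullptr, |
| // BaseOffset == 42, HasBaseReg == true and Scale == 4 together describe |
| // the address [%base + 4 * %index + 42]. |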
| bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
| bool HasBaseReg, int64_t Scale, |
| unsigned AddrSpace) { |
| TargetLoweringBase::AddrMode AM; |
| AM.BaseGV = BaseGV; |
| AM.BaseOffs = BaseOffset; |
| AM.HasBaseReg = HasBaseReg; |
| AM.Scale = Scale; |
| return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace); |
| } |
| |
| int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
| bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { |
| TargetLoweringBase::AddrMode AM; |
| AM.BaseGV = BaseGV; |
| AM.BaseOffs = BaseOffset; |
| AM.HasBaseReg = HasBaseReg; |
| AM.Scale = Scale; |
| return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); |
| } |
| |
| bool isTruncateFree(Type *Ty1, Type *Ty2) { |
| return getTLI()->isTruncateFree(Ty1, Ty2); |
| } |
| |
| bool isProfitableToHoist(Instruction *I) { |
| return getTLI()->isProfitableToHoist(I); |
| } |
| |
| bool isTypeLegal(Type *Ty) { |
| EVT VT = getTLI()->getValueType(DL, Ty); |
| return getTLI()->isTypeLegal(VT); |
| } |
| |
| unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, |
| ArrayRef<const Value *> Arguments) { |
| return BaseT::getIntrinsicCost(IID, RetTy, Arguments); |
| } |
| |
| unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, |
| ArrayRef<Type *> ParamTys) { |
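| // cttz/ctlz are TCC_Basic only when the target declares them cheap to |
| // speculate, typically because a native count-zeros instruction exists; |
| // otherwise assume an expensive expansion. |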
| if (IID == Intrinsic::cttz) { |
| if (getTLI()->isCheapToSpeculateCttz()) |
| return TargetTransformInfo::TCC_Basic; |
| return TargetTransformInfo::TCC_Expensive; |
| } |
| |
| if (IID == Intrinsic::ctlz) { |
| if (getTLI()->isCheapToSpeculateCtlz()) |
| return TargetTransformInfo::TCC_Basic; |
| return TargetTransformInfo::TCC_Expensive; |
| } |
| |
| return BaseT::getIntrinsicCost(IID, RetTy, ParamTys); |
| } |
| |
| unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); } |
| |
| unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); } |
| |
| bool shouldBuildLookupTables() { |
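| // Assume building a lookup table for a switch is worthwhile whenever the |
| // target can lower a jump table (BR_JT) or an indirect branch (BRIND). |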
| const TargetLoweringBase *TLI = getTLI(); |
| return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || |
| TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); |
| } |
| |
| bool haveFastSqrt(Type *Ty) { |
| const TargetLoweringBase *TLI = getTLI(); |
| EVT VT = TLI->getValueType(DL, Ty); |
| return TLI->isTypeLegal(VT) && |
| TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); |
| } |
| |
| unsigned getFPOpCost(Type *Ty) { |
| // By default, FP instructions are no more expensive than integer ones, |
| // since they are implemented in hardware. Target-specific TTI can |
| // override this. |
| return TargetTransformInfo::TCC_Basic; |
| } |
| |
| unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) { |
| const TargetLoweringBase *TLI = getTLI(); |
| switch (Opcode) { |
| default: break; |
| case Instruction::Trunc: { |
| if (TLI->isTruncateFree(OpTy, Ty)) |
| return TargetTransformInfo::TCC_Free; |
| return TargetTransformInfo::TCC_Basic; |
| } |
| case Instruction::ZExt: { |
| if (TLI->isZExtFree(OpTy, Ty)) |
| return TargetTransformInfo::TCC_Free; |
| return TargetTransformInfo::TCC_Basic; |
| } |
| } |
| |
| return BaseT::getOperationCost(Opcode, Ty, OpTy); |
| } |
| |
| void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) { |
| // This unrolling functionality is target independent, but to provide some |
| // motivation for its intended use, for x86: |
| |
| // According to the Intel 64 and IA-32 Architectures Optimization Reference |
| // Manual, Intel Core models and later have a loop stream detector (and |
| // associated uop queue) that can benefit from partial unrolling. |
| // The relevant requirements are: |
| // - The loop must have no more than 4 (8 for Nehalem and later) branches |
| // taken, and none of them may be calls. |
| // - The loop can have no more than 18 (28 for Nehalem and later) uops. |
| |
| // According to the Software Optimization Guide for AMD Family 15h |
| // Processors, models 30h-4fh (Steamroller and later) have a loop predictor |
| // and loop buffer which can benefit from partial unrolling. |
| // The relevant requirements are: |
| // - The loop must have fewer than 16 branches |
| // - The loop must have less than 40 uops in all executed loop branches |
| |
| // The number of taken branches in a loop is hard to estimate here, and |
| // benchmarking has revealed that it is better not to be conservative when |
| // estimating the branch count. As a result, we'll ignore the branch limits |
| // until someone finds a case where it matters in practice. |
| |
| unsigned MaxOps; |
| const TargetSubtargetInfo *ST = getST(); |
| if (PartialUnrollingThreshold.getNumOccurrences() > 0) |
| MaxOps = PartialUnrollingThreshold; |
| else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) |
| MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; |
| else |
| return; |
| |
| // Scan the loop: don't unroll loops with calls. |
| for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; |
| ++I) { |
| BasicBlock *BB = *I; |
| |
| for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) |
| if (isa<CallInst>(J) || isa<InvokeInst>(J)) { |
| ImmutableCallSite CS(J); |
| if (const Function *F = CS.getCalledFunction()) { |
| if (!static_cast<T *>(this)->isLoweredToCall(F)) |
| continue; |
| } |
| |
| return; |
| } |
| } |
| |
| // Enable runtime and partial unrolling up to the specified size. |
| UP.Partial = UP.Runtime = true; |
| UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps; |
| } |
| |
| /// @} |
| |
| /// \name Vector TTI Implementations |
| /// @{ |
| |
| unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; } |
| |
| unsigned getRegisterBitWidth(bool Vector) { return 32; } |
| |
| unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } |
| |
| unsigned getArithmeticInstrCost( |
| unsigned Opcode, Type *Ty, |
| TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, |
| TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, |
| TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, |
| TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { |
| // Map the IR opcode onto an ISD opcode and legalize the type. |
| const TargetLoweringBase *TLI = getTLI(); |
| int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| assert(ISD && "Invalid opcode"); |
| |
| std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
| |
| bool IsFloat = Ty->getScalarType()->isFloatingPointTy(); |
| // Assume that floating point arithmetic operations cost twice as much as |
| // integer operations. |
| unsigned OpCost = (IsFloat ? 2 : 1); |
| |
| if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
| // The operation is legal. Assume it costs 1. |
| // If the type is split to multiple registers, assume that there is some |
| // overhead to this. |
| // TODO: Once we have extract/insert subvector cost we need to use them. |
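| // For example, on a hypothetical target with legal 128-bit vectors, an |
| // fadd on <8 x float> splits into two v4f32 operations: LT.first == 2, |
| // so the estimate below is 2 * 2 * OpCost, i.e. 8 for an FP op. |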
| if (LT.first > 1) |
| return LT.first * 2 * OpCost; |
| return LT.first * 1 * OpCost; |
| } |
| |
| if (!TLI->isOperationExpand(ISD, LT.second)) { |
| // If the operation is custom lowered, assume that the code is twice as |
| // expensive. |
| return LT.first * 2 * OpCost; |
| } |
| |
| // Else, assume that we need to scalarize this op. |
| if (Ty->isVectorTy()) { |
| unsigned Num = Ty->getVectorNumElements(); |
| unsigned Cost = static_cast<T *>(this) |
| ->getArithmeticInstrCost(Opcode, Ty->getScalarType()); |
| // Return the cost of multiple scalar invocations plus the cost of |
| // inserting and extracting the values. |
| return getScalarizationOverhead(Ty, true, true) + Num * Cost; |
| } |
| |
| // We don't know anything about this scalar instruction. |
| return OpCost; |
| } |
| |
| unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, |
| Type *SubTp) { |
| if (Kind == TTI::SK_Alternate) { |
| return getAltShuffleOverhead(Tp); |
| } |
| return 1; |
| } |
| |
| unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { |
| const TargetLoweringBase *TLI = getTLI(); |
| int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| assert(ISD && "Invalid opcode"); |
| std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src); |
| std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst); |
| |
| // Check for NOOP conversions. |
| if (SrcLT.first == DstLT.first && |
| SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { |
| |
| // Bitcasts and truncates between types that are legalized to the same |
| // type are free. |
| if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc) |
| return 0; |
| } |
| |
| if (Opcode == Instruction::Trunc && |
| TLI->isTruncateFree(SrcLT.second, DstLT.second)) |
| return 0; |
| |
| if (Opcode == Instruction::ZExt && |
| TLI->isZExtFree(SrcLT.second, DstLT.second)) |
| return 0; |
| |
| // If the cast is marked as legal (or promote) then assume low cost. |
| if (SrcLT.first == DstLT.first && |
| TLI->isOperationLegalOrPromote(ISD, DstLT.second)) |
| return 1; |
| |
| // Handle scalar conversions. |
| if (!Src->isVectorTy() && !Dst->isVectorTy()) { |
| |
| // Scalar bitcasts are usually free. |
| if (Opcode == Instruction::BitCast) |
| return 0; |
| |
| // Just check the op cost. If the operation is legal then assume it |
| // costs 1. |
| if (!TLI->isOperationExpand(ISD, DstLT.second)) |
| return 1; |
| |
| // Assume that illegal scalar instructions are expensive. |
| return 4; |
| } |
| |
| // Check vector-to-vector casts. |
| if (Dst->isVectorTy() && Src->isVectorTy()) { |
| |
| // If the cast is between same-sized registers, then the check is simple. |
| if (SrcLT.first == DstLT.first && |
| SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { |
| |
| // Assume that Zext is done using AND. |
| if (Opcode == Instruction::ZExt) |
| return 1; |
| |
| // Assume that sext is done using SHL and SRA. |
| if (Opcode == Instruction::SExt) |
| return 2; |
| |
| // Just check the op cost. If the operation is legal then assume it |
| // costs 1 and multiply by the type-legalization overhead. |
| if (!TLI->isOperationExpand(ISD, DstLT.second)) |
| return SrcLT.first * 1; |
| } |
| |
| // If we are converting vectors and the operation is illegal, or |
| // if the vectors are legalized to different types, estimate the |
| // scalarization costs. |
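| // For example, on a target with no native support for the conversion, an |
| // fptoui from <4 x float> to <4 x i8> is costed as four scalar |
| // conversions plus the scalarization overhead (insertion and extraction, |
| // approximated here on the destination type). |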
| unsigned Num = Dst->getVectorNumElements(); |
| unsigned Cost = static_cast<T *>(this)->getCastInstrCost( |
| Opcode, Dst->getScalarType(), Src->getScalarType()); |
| |
| // Return the cost of multiple scalar invocation plus the cost of |
| // inserting and extracting the values. |
| return getScalarizationOverhead(Dst, true, true) + Num * Cost; |
| } |
| |
| // We already handled vector-to-vector and scalar-to-scalar conversions. |
| // This is where we handle bitcasts between vectors and scalars. We need |
| // to assume that the conversion is scalarized in one way or another. |
| if (Opcode == Instruction::BitCast) |
| // Illegal bitcasts are done by storing and loading from a stack slot. |
| return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true) |
| : 0) + |
| (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false) |
| : 0); |
| |
| llvm_unreachable("Unhandled cast"); |
| } |
| |
| unsigned getCFInstrCost(unsigned Opcode) { |
| // Branches are assumed to be predicted. |
| return 0; |
| } |
| |
| unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { |
| const TargetLoweringBase *TLI = getTLI(); |
| int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| assert(ISD && "Invalid opcode"); |
| |
| // Selects on vectors are actually vector selects. |
| if (ISD == ISD::SELECT) { |
| assert(CondTy && "CondTy must exist"); |
| if (CondTy->isVectorTy()) |
| ISD = ISD::VSELECT; |
| } |
| std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); |
| |
| if (!(ValTy->isVectorTy() && !LT.second.isVector()) && |
| !TLI->isOperationExpand(ISD, LT.second)) { |
| // The operation is legal. Assume it costs 1. Multiply |
| // by the type-legalization overhead. |
| return LT.first * 1; |
| } |
| |
| // Otherwise, assume that the operation is scalarized. |
| if (ValTy->isVectorTy()) { |
| unsigned Num = ValTy->getVectorNumElements(); |
| if (CondTy) |
| CondTy = CondTy->getScalarType(); |
| unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost( |
| Opcode, ValTy->getScalarType(), CondTy); |
| |
| // Return the cost of multiple scalar invocations plus the cost of |
| // inserting the results into the vector. |
| return getScalarizationOverhead(ValTy, true, false) + Num * Cost; |
| } |
| |
| // Unknown scalar opcode. |
| return 1; |
| } |
| |
| unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { |
| std::pair<unsigned, MVT> LT = |
| getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); |
| |
| return LT.first; |
| } |
| |
| unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, |
| unsigned AddressSpace) { |
| assert(!Src->isVoidTy() && "Invalid type"); |
| std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src); |
| |
| // Assume that all loads of legal types cost 1. |
| unsigned Cost = LT.first; |
| |
| if (Src->isVectorTy() && |
| Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) { |
| // This is a vector load that legalizes to a larger type than the vector |
| // itself. Unless the corresponding extending load or truncating store is |
| // legal, this will scalarize. |
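| // For example (hypothetically), a <4 x i8> load promoted to a wider |
| // vector type on a target without the corresponding extending load is |
| // costed as the memory operation plus four element insertions to |
| // rebuild the vector. |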
| TargetLowering::LegalizeAction LA = TargetLowering::Expand; |
| EVT MemVT = getTLI()->getValueType(DL, Src, true); |
| if (MemVT.isSimple() && MemVT != MVT::Other) { |
| if (Opcode == Instruction::Store) |
| LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT()); |
| else |
| LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); |
| } |
| |
| if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { |
| // This is a vector load/store for some illegal type that is scalarized. |
| // We must account for the cost of building or decomposing the vector. |
| Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store, |
| Opcode == Instruction::Store); |
| } |
| } |
| |
| return Cost; |
| } |
| |
| unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, |
| unsigned Factor, |
| ArrayRef<unsigned> Indices, |
| unsigned Alignment, |
| unsigned AddressSpace) { |
| VectorType *VT = dyn_cast<VectorType>(VecTy); |
| assert(VT && "Expect a vector type for interleaved memory op"); |
| |
| unsigned NumElts = VT->getNumElements(); |
| assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); |
| |
| unsigned NumSubElts = NumElts / Factor; |
| VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); |
| |
| // First, the cost of the load/store operation. |
| unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); |
| |
| // Then add the cost of the interleave operation. |
| if (Opcode == Instruction::Load) { |
| // The interleave cost is similar to extracting the sub-vectors' |
| // elements from the wide vector and inserting them into the |
| // sub-vectors. |
| // |
| // E.g. An interleaved load of factor 2 (with one member of index 0): |
| // %vec = load <8 x i32>, <8 x i32>* %ptr |
| // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 |
| // The cost is estimated as extracting elements 0, 2, 4, 6 from the |
| // <8 x i32> vector and inserting them into a <4 x i32> vector. |
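| // |
| // For that example (a single member), the added cost is 4 extracts |
| // from the <8 x i32> vector plus 4 inserts into the <4 x i32> |
| // sub-vector, on top of the cost of the load itself. |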
| |
| assert(Indices.size() <= Factor && |
| "Interleaved memory op has too many members"); |
| for (unsigned Index : Indices) { |
| assert(Index < Factor && "Invalid index for interleaved memory op"); |
| |
| // Extract elements from loaded vector for each sub vector. |
| for (unsigned i = 0; i < NumSubElts; i++) |
| Cost += getVectorInstrCost(Instruction::ExtractElement, VT, |
| Index + i * Factor); |
| } |
| |
| unsigned InsSubCost = 0; |
| for (unsigned i = 0; i < NumSubElts; i++) |
| InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i); |
| |
| Cost += Indices.size() * InsSubCost; |
| } else { |
| // The interleave cost is the cost of extracting all elements from the |
| // sub-vectors and inserting them into the wide vector. |
| // |
| // E.g. An interleaved store of factor 2: |
| // %interleaved.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> |
| // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr |
| // The cost is estimated as extracting all elements from both <4 x i32> |
| // vectors and inserting them into the <8 x i32> vector. |
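| // |
| // For that example, the added cost is 2 * 4 extracts from the two |
| // <4 x i32> sub-vectors plus 8 inserts into the <8 x i32> vector, on |
| // top of the cost of the store itself. |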
| |
| unsigned ExtSubCost = 0; |
| for (unsigned i = 0; i < NumSubElts; i++) |
| ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i); |
| |
| Cost += Factor * ExtSubCost; |
| |
| for (unsigned i = 0; i < NumElts; i++) |
| Cost += getVectorInstrCost(Instruction::InsertElement, VT, i); |
| } |
| |
| return Cost; |
| } |
| |
| unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, |
| ArrayRef<Type *> Tys) { |
| unsigned ISD = 0; |
| switch (IID) { |
| default: { |
| // Assume that we need to scalarize this intrinsic. |
| unsigned ScalarizationCost = 0; |
| unsigned ScalarCalls = 1; |
| Type *ScalarRetTy = RetTy; |
| if (RetTy->isVectorTy()) { |
| ScalarizationCost = getScalarizationOverhead(RetTy, true, false); |
| ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements()); |
| ScalarRetTy = RetTy->getScalarType(); |
| } |
| SmallVector<Type *, 4> ScalarTys; |
| for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
| Type *Ty = Tys[i]; |
| if (Ty->isVectorTy()) { |
| ScalarizationCost += getScalarizationOverhead(Ty, false, true); |
| ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements()); |
| Ty = Ty->getScalarType(); |
| } |
| ScalarTys.push_back(Ty); |
| } |
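| // For example, an intrinsic returning <4 x float> with one <4 x float> |
| // argument is costed as 4 scalar calls plus the cost of inserting the 4 |
| // results and extracting the 4 lanes of the vector argument. |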
| if (ScalarCalls == 1) |
| return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. |
| |
| unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost( |
| IID, ScalarRetTy, ScalarTys); |
| |
| return ScalarCalls * ScalarCost + ScalarizationCost; |
| } |
| // Look for intrinsics that can be lowered directly or turned into a scalar |
| // intrinsic call. |
| case Intrinsic::sqrt: |
| ISD = ISD::FSQRT; |
| break; |
| case Intrinsic::sin: |
| ISD = ISD::FSIN; |
| break; |
| case Intrinsic::cos: |
| ISD = ISD::FCOS; |
| break; |
| case Intrinsic::exp: |
| ISD = ISD::FEXP; |
| break; |
| case Intrinsic::exp2: |
| ISD = ISD::FEXP2; |
| break; |
| case Intrinsic::log: |
| ISD = ISD::FLOG; |
| break; |
| case Intrinsic::log10: |
| ISD = ISD::FLOG10; |
| break; |
| case Intrinsic::log2: |
| ISD = ISD::FLOG2; |
| break; |
| case Intrinsic::fabs: |
| ISD = ISD::FABS; |
| break; |
| case Intrinsic::minnum: |
| ISD = ISD::FMINNUM; |
| break; |
| case Intrinsic::maxnum: |
| ISD = ISD::FMAXNUM; |
| break; |
| case Intrinsic::copysign: |
| ISD = ISD::FCOPYSIGN; |
| break; |
| case Intrinsic::floor: |
| ISD = ISD::FFLOOR; |
| break; |
| case Intrinsic::ceil: |
| ISD = ISD::FCEIL; |
| break; |
| case Intrinsic::trunc: |
| ISD = ISD::FTRUNC; |
| break; |
| case Intrinsic::nearbyint: |
| ISD = ISD::FNEARBYINT; |
| break; |
| case Intrinsic::rint: |
| ISD = ISD::FRINT; |
| break; |
| case Intrinsic::round: |
| ISD = ISD::FROUND; |
| break; |
| case Intrinsic::pow: |
| ISD = ISD::FPOW; |
| break; |
| case Intrinsic::fma: |
| case Intrinsic::fmuladd: |
| ISD = ISD::FMA; |
| break; |
| // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. |
| case Intrinsic::lifetime_start: |
| case Intrinsic::lifetime_end: |
| return 0; |
| case Intrinsic::masked_store: |
| return static_cast<T *>(this) |
| ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0); |
| case Intrinsic::masked_load: |
| return static_cast<T *>(this) |
| ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0); |
| } |
| |
| const TargetLoweringBase *TLI = getTLI(); |
| std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); |
| |
| if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
| // The operation is legal. Assume it costs 1. |
| // If the type is split to multiple registers, assume that there is some |
| // overhead to this. |
| // TODO: Once we have extract/insert subvector cost we need to use them. |
| if (LT.first > 1) |
| return LT.first * 2; |
| return LT.first * 1; |
| } |
| |
| if (!TLI->isOperationExpand(ISD, LT.second)) { |
| // If the operation is custom lowered, assume that the code is twice as |
| // expensive. |
| return LT.first * 2; |
| } |
| |
| // If we can't lower fmuladd into an FMA, estimate the cost as a |
| // floating-point mul followed by an add. |
| if (IID == Intrinsic::fmuladd) |
| return static_cast<T *>(this) |
| ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) + |
| static_cast<T *>(this) |
| ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy); |
| |
| // Else, assume that we need to scalarize this intrinsic. For math builtins |
| // this will emit a costly libcall, adding call overhead and spills. Make it |
| // very expensive. |
| if (RetTy->isVectorTy()) { |
| unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false); |
| unsigned ScalarCalls = RetTy->getVectorNumElements(); |
| SmallVector<Type *, 4> ScalarTys; |
| for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
| Type *Ty = Tys[i]; |
| if (Ty->isVectorTy()) |
| Ty = Ty->getScalarType(); |
| ScalarTys.push_back(Ty); |
| } |
| unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost( |
| IID, RetTy->getScalarType(), ScalarTys); |
| for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
| if (Tys[i]->isVectorTy()) { |
| ScalarizationCost += getScalarizationOverhead(Tys[i], false, true); |
| ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements()); |
| } |
| } |
| |
| return ScalarCalls * ScalarCost + ScalarizationCost; |
| } |
| |
| // This is going to be turned into a library call, make it expensive. |
| return 10; |
| } |
| |
| /// \brief Compute the cost of the given call instruction. |
| /// |
| /// Compute the cost of calling function F with return type RetTy and |
| /// argument types Tys. F might be nullptr, in which case the cost of an |
| /// arbitrary call with the specified signature will be returned. |
| /// This is used, for instance, when we estimate a call of a vector |
| /// counterpart of the given function. |
| /// \param F Called function, might be nullptr. |
| /// \param RetTy Return value type. |
| /// \param Tys Argument types. |
| /// \returns The cost of the Call instruction. |
| unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) { |
| return 10; |
| } |
| |
| unsigned getNumberOfParts(Type *Tp) { |
| std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp); |
| return LT.first; |
| } |
| |
| unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; } |
| |
| unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) { |
| assert(Ty->isVectorTy() && "Expect a vector type"); |
| unsigned NumVecElts = Ty->getVectorNumElements(); |
| unsigned NumReduxLevels = Log2_32(NumVecElts); |
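| // For example, reducing <8 x float> takes Log2(8) == 3 levels; a |
| // pairwise reduction is assumed to need two shuffles per level instead |
| // of one. |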
| unsigned ArithCost = |
| NumReduxLevels * |
| static_cast<T *>(this)->getArithmeticInstrCost(Opcode, Ty); |
| // Assume the pairwise shuffles add a cost. |
| unsigned ShuffleCost = |
| NumReduxLevels * (IsPairwise + 1) * |
| static_cast<T *>(this) |
| ->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts / 2, Ty); |
| return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true); |
| } |
| |
| /// @} |
| }; |
| |
| /// \brief Concrete BasicTTIImpl that can be used if no further customization |
| /// is needed. |
| class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { |
| typedef BasicTTIImplBase<BasicTTIImpl> BaseT; |
| friend class BasicTTIImplBase<BasicTTIImpl>; |
| |
| const TargetSubtargetInfo *ST; |
| const TargetLoweringBase *TLI; |
| |
| const TargetSubtargetInfo *getST() const { return ST; } |
| const TargetLoweringBase *getTLI() const { return TLI; } |
| |
| public: |
| explicit BasicTTIImpl(const TargetMachine *ST, Function &F); |
| |
| // Provide value semantics. MSVC requires that we spell all of these out. |
| BasicTTIImpl(const BasicTTIImpl &Arg) |
| : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} |
| BasicTTIImpl(BasicTTIImpl &&Arg) |
| : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), |
| TLI(std::move(Arg.TLI)) {} |
| }; |
| |
| } // end namespace llvm |
| |
| #endif |