//===-- RISCVLegalizerInfo.cpp ----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for RISC-V.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "RISCVLegalizerInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;
using namespace LegalityPredicates;
using namespace LegalizeMutations;
// Is this type supported by scalar FP arithmetic operations given the current
// subtarget?
static LegalityPredicate typeIsScalarFPArith(unsigned TypeIdx,
const RISCVSubtarget &ST) {
return [=, &ST](const LegalityQuery &Query) {
return Query.Types[TypeIdx].isScalar() &&
((ST.hasStdExtZfh() && Query.Types[TypeIdx].getSizeInBits() == 16) ||
(ST.hasStdExtF() && Query.Types[TypeIdx].getSizeInBits() == 32) ||
(ST.hasStdExtD() && Query.Types[TypeIdx].getSizeInBits() == 64));
};
}
static LegalityPredicate
typeIsLegalIntOrFPVec(unsigned TypeIdx,
std::initializer_list<LLT> IntOrFPVecTys,
const RISCVSubtarget &ST) {
LegalityPredicate P = [=, &ST](const LegalityQuery &Query) {
return ST.hasVInstructions() &&
(Query.Types[TypeIdx].getScalarSizeInBits() != 64 ||
ST.hasVInstructionsI64()) &&
(Query.Types[TypeIdx].getElementCount().getKnownMinValue() != 1 ||
ST.getELen() == 64);
};
return all(typeInSet(TypeIdx, IntOrFPVecTys), P);
}
static LegalityPredicate
typeIsLegalBoolVec(unsigned TypeIdx, std::initializer_list<LLT> BoolVecTys,
const RISCVSubtarget &ST) {
LegalityPredicate P = [=, &ST](const LegalityQuery &Query) {
return ST.hasVInstructions() &&
(Query.Types[TypeIdx].getElementCount().getKnownMinValue() != 1 ||
ST.getELen() == 64);
};
return all(typeInSet(TypeIdx, BoolVecTys), P);
}
static LegalityPredicate typeIsLegalPtrVec(unsigned TypeIdx,
std::initializer_list<LLT> PtrVecTys,
const RISCVSubtarget &ST) {
LegalityPredicate P = [=, &ST](const LegalityQuery &Query) {
return ST.hasVInstructions() &&
(Query.Types[TypeIdx].getElementCount().getKnownMinValue() != 1 ||
ST.getELen() == 64) &&
(Query.Types[TypeIdx].getElementCount().getKnownMinValue() != 16 ||
Query.Types[TypeIdx].getScalarSizeInBits() == 32);
};
return all(typeInSet(TypeIdx, PtrVecTys), P);
}
RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
: STI(ST), XLen(STI.getXLen()), sXLen(LLT::scalar(XLen)) {
const LLT sDoubleXLen = LLT::scalar(2 * XLen);
const LLT p0 = LLT::pointer(0, XLen);
const LLT s1 = LLT::scalar(1);
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
const LLT nxv1s1 = LLT::scalable_vector(1, s1);
const LLT nxv2s1 = LLT::scalable_vector(2, s1);
const LLT nxv4s1 = LLT::scalable_vector(4, s1);
const LLT nxv8s1 = LLT::scalable_vector(8, s1);
const LLT nxv16s1 = LLT::scalable_vector(16, s1);
const LLT nxv32s1 = LLT::scalable_vector(32, s1);
const LLT nxv64s1 = LLT::scalable_vector(64, s1);
const LLT nxv1s8 = LLT::scalable_vector(1, s8);
const LLT nxv2s8 = LLT::scalable_vector(2, s8);
const LLT nxv4s8 = LLT::scalable_vector(4, s8);
const LLT nxv8s8 = LLT::scalable_vector(8, s8);
const LLT nxv16s8 = LLT::scalable_vector(16, s8);
const LLT nxv32s8 = LLT::scalable_vector(32, s8);
const LLT nxv64s8 = LLT::scalable_vector(64, s8);
const LLT nxv1s16 = LLT::scalable_vector(1, s16);
const LLT nxv2s16 = LLT::scalable_vector(2, s16);
const LLT nxv4s16 = LLT::scalable_vector(4, s16);
const LLT nxv8s16 = LLT::scalable_vector(8, s16);
const LLT nxv16s16 = LLT::scalable_vector(16, s16);
const LLT nxv32s16 = LLT::scalable_vector(32, s16);
const LLT nxv1s32 = LLT::scalable_vector(1, s32);
const LLT nxv2s32 = LLT::scalable_vector(2, s32);
const LLT nxv4s32 = LLT::scalable_vector(4, s32);
const LLT nxv8s32 = LLT::scalable_vector(8, s32);
const LLT nxv16s32 = LLT::scalable_vector(16, s32);
const LLT nxv1s64 = LLT::scalable_vector(1, s64);
const LLT nxv2s64 = LLT::scalable_vector(2, s64);
const LLT nxv4s64 = LLT::scalable_vector(4, s64);
const LLT nxv8s64 = LLT::scalable_vector(8, s64);
const LLT nxv1p0 = LLT::scalable_vector(1, p0);
const LLT nxv2p0 = LLT::scalable_vector(2, p0);
const LLT nxv4p0 = LLT::scalable_vector(4, p0);
const LLT nxv8p0 = LLT::scalable_vector(8, p0);
const LLT nxv16p0 = LLT::scalable_vector(16, p0);
using namespace TargetOpcode;
auto BoolVecTys = {nxv1s1, nxv2s1, nxv4s1, nxv8s1, nxv16s1, nxv32s1, nxv64s1};
auto IntOrFPVecTys = {nxv1s8, nxv2s8, nxv4s8, nxv8s8, nxv16s8, nxv32s8,
nxv64s8, nxv1s16, nxv2s16, nxv4s16, nxv8s16, nxv16s16,
nxv32s16, nxv1s32, nxv2s32, nxv4s32, nxv8s32, nxv16s32,
nxv1s64, nxv2s64, nxv4s64, nxv8s64};
auto PtrVecTys = {nxv1p0, nxv2p0, nxv4p0, nxv8p0, nxv16p0};
getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
.legalFor({s32, sXLen})
.legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
.widenScalarToNextPow2(0)
.clampScalar(0, s32, sXLen);
getActionDefinitionsBuilder(
{G_UADDE, G_UADDO, G_USUBE, G_USUBO}).lower();
getActionDefinitionsBuilder({G_SADDO, G_SSUBO}).minScalar(0, sXLen).lower();
// TODO: Use Vector Single-Width Saturating Instructions for vector types.
getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
.lower();
getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL})
.legalFor({{s32, s32}, {sXLen, sXLen}})
.widenScalarToNextPow2(0)
.clampScalar(1, s32, sXLen)
.clampScalar(0, s32, sXLen)
.minScalarSameAs(1, 0)
.maxScalarSameAs(1, 0);
auto &ExtActions =
getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
.legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
if (ST.is64Bit()) {
ExtActions.legalFor({{sXLen, s32}});
getActionDefinitionsBuilder(G_SEXT_INREG)
.customFor({s32, sXLen})
.maxScalar(0, sXLen)
.lower();
} else {
getActionDefinitionsBuilder(G_SEXT_INREG)
.customFor({s32})
.maxScalar(0, sXLen)
.lower();
}
ExtActions.customIf(typeIsLegalBoolVec(1, BoolVecTys, ST))
.maxScalar(0, sXLen);
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
auto &MergeUnmergeActions = getActionDefinitionsBuilder(Op);
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
if (XLen == 32 && ST.hasStdExtD()) {
MergeUnmergeActions.legalIf(
all(typeIs(BigTyIdx, s64), typeIs(LitTyIdx, s32)));
}
MergeUnmergeActions.widenScalarToNextPow2(LitTyIdx, XLen)
.widenScalarToNextPow2(BigTyIdx, XLen)
.clampScalar(LitTyIdx, sXLen, sXLen)
.clampScalar(BigTyIdx, sXLen, sXLen);
}
getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
getActionDefinitionsBuilder({G_ROTL, G_ROTR})
.legalFor(ST.hasStdExtZbb() || ST.hasStdExtZbkb(),
{{s32, s32}, {sXLen, sXLen}})
.lower();
getActionDefinitionsBuilder(G_BITREVERSE).maxScalar(0, sXLen).lower();
getActionDefinitionsBuilder(G_BITCAST).legalIf(
all(LegalityPredicates::any(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIsLegalBoolVec(0, BoolVecTys, ST)),
LegalityPredicates::any(typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST),
typeIsLegalBoolVec(1, BoolVecTys, ST))));
auto &BSWAPActions = getActionDefinitionsBuilder(G_BSWAP);
if (ST.hasStdExtZbb() || ST.hasStdExtZbkb())
BSWAPActions.legalFor({sXLen}).clampScalar(0, sXLen, sXLen);
else
BSWAPActions.maxScalar(0, sXLen).lower();
auto &CountZerosActions = getActionDefinitionsBuilder({G_CTLZ, G_CTTZ});
auto &CountZerosUndefActions =
getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF});
if (ST.hasStdExtZbb()) {
CountZerosActions.legalFor({{s32, s32}, {sXLen, sXLen}})
.clampScalar(0, s32, sXLen)
.widenScalarToNextPow2(0)
.scalarSameSizeAs(1, 0);
} else {
CountZerosActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0).lower();
CountZerosUndefActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0);
}
CountZerosUndefActions.lower();
auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
if (ST.hasStdExtZbb()) {
CTPOPActions.legalFor({{s32, s32}, {sXLen, sXLen}})
.clampScalar(0, s32, sXLen)
.widenScalarToNextPow2(0)
.scalarSameSizeAs(1, 0);
} else {
CTPOPActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0).lower();
}
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({s32, p0})
.customFor(ST.is64Bit(), {s64})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, sXLen);
  // TODO: transform illegal vector types into legal vector types
getActionDefinitionsBuilder(
{G_IMPLICIT_DEF, G_CONSTANT_FOLD_BARRIER, G_FREEZE})
.legalFor({s32, sXLen, p0})
.legalIf(typeIsLegalBoolVec(0, BoolVecTys, ST))
.legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
.widenScalarToNextPow2(0)
.clampScalar(0, s32, sXLen);
getActionDefinitionsBuilder(G_ICMP)
.legalFor({{sXLen, sXLen}, {sXLen, p0}})
.legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)))
.widenScalarOrEltToNextPow2OrMinSize(1, 8)
.clampScalar(1, sXLen, sXLen)
.clampScalar(0, sXLen, sXLen);
getActionDefinitionsBuilder(G_SELECT)
.legalFor({{s32, sXLen}, {p0, sXLen}})
.legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIsLegalBoolVec(1, BoolVecTys, ST)))
.legalFor(XLen == 64 || ST.hasStdExtD(), {{s64, sXLen}})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, (XLen == 64 || ST.hasStdExtD()) ? s64 : s32)
.clampScalar(1, sXLen, sXLen);
auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
auto &ExtLoadActions = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD});
// Return the alignment needed for scalar memory ops. If unaligned scalar mem
// is supported, we only require byte alignment. Otherwise, we need the memory
// op to be natively aligned.
auto getScalarMemAlign = [&ST](unsigned Size) {
return ST.enableUnalignedScalarMem() ? 8 : Size;
};
LoadActions.legalForTypesWithMemDesc(
{{s32, p0, s8, getScalarMemAlign(8)},
{s32, p0, s16, getScalarMemAlign(16)},
{s32, p0, s32, getScalarMemAlign(32)},
{p0, p0, sXLen, getScalarMemAlign(XLen)}});
StoreActions.legalForTypesWithMemDesc(
{{s32, p0, s8, getScalarMemAlign(8)},
{s32, p0, s16, getScalarMemAlign(16)},
{s32, p0, s32, getScalarMemAlign(32)},
{p0, p0, sXLen, getScalarMemAlign(XLen)}});
ExtLoadActions.legalForTypesWithMemDesc(
{{s32, p0, s8, getScalarMemAlign(8)},
{s32, p0, s16, getScalarMemAlign(16)}});
if (XLen == 64) {
LoadActions.legalForTypesWithMemDesc(
{{s64, p0, s8, getScalarMemAlign(8)},
{s64, p0, s16, getScalarMemAlign(16)},
{s64, p0, s32, getScalarMemAlign(32)},
{s64, p0, s64, getScalarMemAlign(64)}});
StoreActions.legalForTypesWithMemDesc(
{{s64, p0, s8, getScalarMemAlign(8)},
{s64, p0, s16, getScalarMemAlign(16)},
{s64, p0, s32, getScalarMemAlign(32)},
{s64, p0, s64, getScalarMemAlign(64)}});
ExtLoadActions.legalForTypesWithMemDesc(
{{s64, p0, s8, getScalarMemAlign(8)},
{s64, p0, s16, getScalarMemAlign(16)},
{s64, p0, s32, getScalarMemAlign(32)}});
} else if (ST.hasStdExtD()) {
LoadActions.legalForTypesWithMemDesc(
{{s64, p0, s64, getScalarMemAlign(64)}});
StoreActions.legalForTypesWithMemDesc(
{{s64, p0, s64, getScalarMemAlign(64)}});
}
// Vector loads/stores.
if (ST.hasVInstructions()) {
LoadActions.legalForTypesWithMemDesc({{nxv2s8, p0, nxv2s8, 8},
{nxv4s8, p0, nxv4s8, 8},
{nxv8s8, p0, nxv8s8, 8},
{nxv16s8, p0, nxv16s8, 8},
{nxv32s8, p0, nxv32s8, 8},
{nxv64s8, p0, nxv64s8, 8},
{nxv2s16, p0, nxv2s16, 16},
{nxv4s16, p0, nxv4s16, 16},
{nxv8s16, p0, nxv8s16, 16},
{nxv16s16, p0, nxv16s16, 16},
{nxv32s16, p0, nxv32s16, 16},
{nxv2s32, p0, nxv2s32, 32},
{nxv4s32, p0, nxv4s32, 32},
{nxv8s32, p0, nxv8s32, 32},
{nxv16s32, p0, nxv16s32, 32}});
StoreActions.legalForTypesWithMemDesc({{nxv2s8, p0, nxv2s8, 8},
{nxv4s8, p0, nxv4s8, 8},
{nxv8s8, p0, nxv8s8, 8},
{nxv16s8, p0, nxv16s8, 8},
{nxv32s8, p0, nxv32s8, 8},
{nxv64s8, p0, nxv64s8, 8},
{nxv2s16, p0, nxv2s16, 16},
{nxv4s16, p0, nxv4s16, 16},
{nxv8s16, p0, nxv8s16, 16},
{nxv16s16, p0, nxv16s16, 16},
{nxv32s16, p0, nxv32s16, 16},
{nxv2s32, p0, nxv2s32, 32},
{nxv4s32, p0, nxv4s32, 32},
{nxv8s32, p0, nxv8s32, 32},
{nxv16s32, p0, nxv16s32, 32}});
if (ST.getELen() == 64) {
LoadActions.legalForTypesWithMemDesc({{nxv1s8, p0, nxv1s8, 8},
{nxv1s16, p0, nxv1s16, 16},
{nxv1s32, p0, nxv1s32, 32}});
StoreActions.legalForTypesWithMemDesc({{nxv1s8, p0, nxv1s8, 8},
{nxv1s16, p0, nxv1s16, 16},
{nxv1s32, p0, nxv1s32, 32}});
}
if (ST.hasVInstructionsI64()) {
LoadActions.legalForTypesWithMemDesc({{nxv1s64, p0, nxv1s64, 64},
{nxv2s64, p0, nxv2s64, 64},
{nxv4s64, p0, nxv4s64, 64},
{nxv8s64, p0, nxv8s64, 64}});
StoreActions.legalForTypesWithMemDesc({{nxv1s64, p0, nxv1s64, 64},
{nxv2s64, p0, nxv2s64, 64},
{nxv4s64, p0, nxv4s64, 64},
{nxv8s64, p0, nxv8s64, 64}});
}
    // We take the custom lowering path for scalable vector types with
    // non-standard alignments.
LoadActions.customIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST));
StoreActions.customIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST));
    // Pointers require that XLen-sized elements are legal.
if (XLen <= ST.getELen()) {
LoadActions.customIf(typeIsLegalPtrVec(0, PtrVecTys, ST));
StoreActions.customIf(typeIsLegalPtrVec(0, PtrVecTys, ST));
}
}
LoadActions.widenScalarToNextPow2(0, /* MinSize = */ 8)
.lowerIfMemSizeNotByteSizePow2()
.clampScalar(0, s32, sXLen)
.lower();
StoreActions
.clampScalar(0, s32, sXLen)
.lowerIfMemSizeNotByteSizePow2()
.lower();
ExtLoadActions.widenScalarToNextPow2(0).clampScalar(0, s32, sXLen).lower();
getActionDefinitionsBuilder({G_PTR_ADD, G_PTRMASK}).legalFor({{p0, sXLen}});
getActionDefinitionsBuilder(G_PTRTOINT)
.legalFor({{sXLen, p0}})
.clampScalar(0, sXLen, sXLen);
getActionDefinitionsBuilder(G_INTTOPTR)
.legalFor({{p0, sXLen}})
.clampScalar(1, sXLen, sXLen);
getActionDefinitionsBuilder(G_BRCOND).legalFor({sXLen}).minScalar(0, sXLen);
getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, sXLen}});
getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
getActionDefinitionsBuilder(G_PHI)
.legalFor({p0, sXLen})
.widenScalarToNextPow2(0)
.clampScalar(0, sXLen, sXLen);
getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE, G_CONSTANT_POOL})
.legalFor({p0});
if (ST.hasStdExtZmmul()) {
getActionDefinitionsBuilder(G_MUL)
.legalFor({s32, sXLen})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, sXLen);
// clang-format off
getActionDefinitionsBuilder({G_SMULH, G_UMULH})
.legalFor({sXLen})
.lower();
// clang-format on
getActionDefinitionsBuilder({G_SMULO, G_UMULO}).minScalar(0, sXLen).lower();
} else {
getActionDefinitionsBuilder(G_MUL)
.libcallFor({sXLen, sDoubleXLen})
.widenScalarToNextPow2(0)
.clampScalar(0, sXLen, sDoubleXLen);
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).lowerFor({sXLen});
getActionDefinitionsBuilder({G_SMULO, G_UMULO})
.minScalar(0, sXLen)
// Widen sXLen to sDoubleXLen so we can use a single libcall to get
// the low bits for the mul result and high bits to do the overflow
// check.
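        // For example, on RV32 an s32 G_SMULO is widened to s64 here; the s64
        // multiply then becomes a single __muldi3 libcall, and overflow is
        // detected by checking whether the 64-bit product sign-extends from
        // the low 32 bits. (Illustrative; the exact sequence comes from the
        // generic lowering.)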
.widenScalarIf(typeIs(0, sXLen),
LegalizeMutations::changeTo(0, sDoubleXLen))
.lower();
}
if (ST.hasStdExtM()) {
getActionDefinitionsBuilder({G_UDIV, G_SDIV, G_UREM, G_SREM})
.legalFor({s32, sXLen})
.libcallFor({sDoubleXLen})
.clampScalar(0, s32, sDoubleXLen)
.widenScalarToNextPow2(0);
} else {
getActionDefinitionsBuilder({G_UDIV, G_SDIV, G_UREM, G_SREM})
.libcallFor({sXLen, sDoubleXLen})
.clampScalar(0, sXLen, sDoubleXLen)
.widenScalarToNextPow2(0);
}
// TODO: Use libcall for sDoubleXLen.
getActionDefinitionsBuilder({G_UDIVREM, G_SDIVREM}).lower();
getActionDefinitionsBuilder(G_ABS)
.customFor(ST.hasStdExtZbb(), {sXLen})
.minScalar(ST.hasStdExtZbb(), 0, sXLen)
.lower();
getActionDefinitionsBuilder({G_UMAX, G_UMIN, G_SMAX, G_SMIN})
.legalFor(ST.hasStdExtZbb(), {sXLen})
.minScalar(ST.hasStdExtZbb(), 0, sXLen)
.lower();
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
getActionDefinitionsBuilder({G_DYN_STACKALLOC, G_STACKSAVE, G_STACKRESTORE})
.lower();
// FP Operations
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM})
.legalIf(typeIsScalarFPArith(0, ST));
getActionDefinitionsBuilder(G_FREM)
.libcallFor({s32, s64})
.minScalar(0, s32)
.scalarize(0);
getActionDefinitionsBuilder(G_FCOPYSIGN)
.legalIf(all(typeIsScalarFPArith(0, ST), typeIsScalarFPArith(1, ST)));
// FIXME: Use Zfhmin.
getActionDefinitionsBuilder(G_FPTRUNC).legalIf(
[=, &ST](const LegalityQuery &Query) -> bool {
return (ST.hasStdExtD() && typeIs(0, s32)(Query) &&
typeIs(1, s64)(Query)) ||
(ST.hasStdExtZfh() && typeIs(0, s16)(Query) &&
typeIs(1, s32)(Query)) ||
(ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s16)(Query) &&
typeIs(1, s64)(Query));
});
getActionDefinitionsBuilder(G_FPEXT).legalIf(
[=, &ST](const LegalityQuery &Query) -> bool {
return (ST.hasStdExtD() && typeIs(0, s64)(Query) &&
typeIs(1, s32)(Query)) ||
(ST.hasStdExtZfh() && typeIs(0, s32)(Query) &&
typeIs(1, s16)(Query)) ||
(ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s64)(Query) &&
typeIs(1, s16)(Query));
});
getActionDefinitionsBuilder(G_FCMP)
.legalIf(all(typeIs(0, sXLen), typeIsScalarFPArith(1, ST)))
.clampScalar(0, sXLen, sXLen);
// TODO: Support vector version of G_IS_FPCLASS.
getActionDefinitionsBuilder(G_IS_FPCLASS)
.customIf(all(typeIs(0, s1), typeIsScalarFPArith(1, ST)));
getActionDefinitionsBuilder(G_FCONSTANT)
.legalIf(typeIsScalarFPArith(0, ST))
.lowerFor({s32, s64});
getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
.legalIf(all(typeInSet(0, {s32, sXLen}), typeIsScalarFPArith(1, ST)))
.widenScalarToNextPow2(0)
.minScalar(0, s32)
.libcallFor({{s32, s32}, {s64, s32}, {s32, s64}, {s64, s64}})
.libcallFor(ST.is64Bit(), {{s128, s32}, {s128, s64}});
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalIf(all(typeIsScalarFPArith(0, ST), typeInSet(1, {s32, sXLen})))
.widenScalarToNextPow2(1)
.minScalar(1, s32)
.libcallFor({{s32, s32}, {s64, s32}, {s32, s64}, {s64, s64}})
.libcallFor(ST.is64Bit(), {{s32, s128}, {s64, s128}});
// FIXME: We can do custom inline expansion like SelectionDAG.
// FIXME: Legal with Zfa.
getActionDefinitionsBuilder({G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
G_INTRINSIC_ROUNDEVEN})
.libcallFor({s32, s64});
getActionDefinitionsBuilder(G_VASTART).customFor({p0});
// va_list must be a pointer, but most sized types are pretty easy to handle
// as the destination.
getActionDefinitionsBuilder(G_VAARG)
// TODO: Implement narrowScalar and widenScalar for G_VAARG for types
// other than sXLen.
.clampScalar(0, sXLen, sXLen)
.lowerForCartesianProduct({sXLen, p0}, {p0});
getActionDefinitionsBuilder(G_VSCALE)
.clampScalar(0, sXLen, sXLen)
.customFor({sXLen});
auto &SplatActions =
getActionDefinitionsBuilder(G_SPLAT_VECTOR)
.legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIs(1, sXLen)))
.customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), typeIs(1, s1)));
// Handle case of s64 element vectors on RV32. If the subtarget does not have
// f64, then try to lower it to G_SPLAT_VECTOR_SPLIT_64_VL. If the subtarget
// does have f64, then we don't know whether the type is an f64 or an i64,
// so mark the G_SPLAT_VECTOR as legal and decide later what to do with it,
// depending on how the instructions it consumes are legalized. They are not
// legalized yet since legalization is in reverse postorder, so we cannot
// make the decision at this moment.
if (XLen == 32) {
if (ST.hasVInstructionsF64() && ST.hasStdExtD())
SplatActions.legalIf(all(
typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
else if (ST.hasVInstructionsI64())
SplatActions.customIf(all(
typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
}
SplatActions.clampScalar(1, sXLen, sXLen);
LegalityPredicate ExtractSubvecBitcastPred = [=](const LegalityQuery &Query) {
LLT DstTy = Query.Types[0];
LLT SrcTy = Query.Types[1];
return DstTy.getElementType() == LLT::scalar(1) &&
DstTy.getElementCount().getKnownMinValue() >= 8 &&
SrcTy.getElementCount().getKnownMinValue() >= 8;
};
getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
// We don't have the ability to slide mask vectors down indexed by their
// i1 elements; the smallest we can do is i8. Often we are able to bitcast
// to equivalent i8 vectors.
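      // For example, extracting nxv8s1 from nxv16s1 can instead be done as
      // extracting nxv1s8 from nxv2s8 (both element counts divided by 8).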
.bitcastIf(
all(typeIsLegalBoolVec(0, BoolVecTys, ST),
typeIsLegalBoolVec(1, BoolVecTys, ST), ExtractSubvecBitcastPred),
[=](const LegalityQuery &Query) {
LLT CastTy = LLT::vector(
Query.Types[0].getElementCount().divideCoefficientBy(8), 8);
return std::pair(0, CastTy);
})
.customIf(LegalityPredicates::any(
all(typeIsLegalBoolVec(0, BoolVecTys, ST),
typeIsLegalBoolVec(1, BoolVecTys, ST)),
all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST))));
getActionDefinitionsBuilder(G_INSERT_SUBVECTOR)
.customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
typeIsLegalBoolVec(1, BoolVecTys, ST)))
.customIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
getLegacyLegalizerInfo().computeTables();
}
bool RISCVLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrinsicID) {
default:
return false;
case Intrinsic::vacopy: {
// vacopy arguments must be legal because of the intrinsic signature.
// No need to check here.
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
MachineFunction &MF = *MI.getMF();
const DataLayout &DL = MIRBuilder.getDataLayout();
LLVMContext &Ctx = MF.getFunction().getContext();
Register DstLst = MI.getOperand(1).getReg();
LLT PtrTy = MRI.getType(DstLst);
// Load the source va_list
Align Alignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, Alignment);
auto Tmp = MIRBuilder.buildLoad(PtrTy, MI.getOperand(2), *LoadMMO);
// Store the result in the destination va_list
MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, Alignment);
MIRBuilder.buildStore(Tmp, DstLst, *StoreMMO);
MI.eraseFromParent();
return true;
}
}
}
bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI,
MachineIRBuilder &MIRBuilder) const {
  // Store the address of the VarArgsFrameIndex slot into the memory location.
assert(MI.getOpcode() == TargetOpcode::G_VASTART);
MachineFunction *MF = MI.getParent()->getParent();
RISCVMachineFunctionInfo *FuncInfo = MF->getInfo<RISCVMachineFunctionInfo>();
int FI = FuncInfo->getVarArgsFrameIndex();
LLT AddrTy = MIRBuilder.getMRI()->getType(MI.getOperand(0).getReg());
auto FINAddr = MIRBuilder.buildFrameIndex(AddrTy, FI);
assert(MI.hasOneMemOperand());
MIRBuilder.buildStore(FINAddr, MI.getOperand(0).getReg(),
*MI.memoperands()[0]);
MI.eraseFromParent();
return true;
}
bool RISCVLegalizerInfo::shouldBeInConstantPool(const APInt &APImm,
bool ShouldOptForSize) const {
assert(APImm.getBitWidth() == 32 || APImm.getBitWidth() == 64);
int64_t Imm = APImm.getSExtValue();
  // All simm32 constants should be handled by isel.
  // NOTE: The getMaxBuildIntsCost call below should return a value >= 2,
  // making this check redundant, but small immediates are common, so checking
  // them first improves compile time.
if (isInt<32>(Imm))
return false;
  // We only need to cost the immediate if constant pool lowering is enabled.
if (!STI.useConstantPoolForLargeInts())
return false;
RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Imm, STI);
if (Seq.size() <= STI.getMaxBuildIntsCost())
return false;
// Optimizations below are disabled for opt size. If we're optimizing for
// size, use a constant pool.
if (ShouldOptForSize)
return true;
  // Special case: see if we can build the constant as (ADD (SLLI X, C), X);
  // do that if it will avoid a constant pool. It requires an extra temporary
  // register, though.
  // If we have Zba, we can use (ADD_UW X, (SLLI X, 32)) to handle cases where
  // the low and high 32 bits are the same and bits 31 and 63 are set.
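  // Illustrative example: for Imm = 0x0808000008080000, we can materialize
  // X = 0x08080000 and then build the constant as (ADD (SLLI X, 32), X),
  // avoiding the constant pool at the cost of a temporary register.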
unsigned ShiftAmt, AddOpc;
RISCVMatInt::InstSeq SeqLo =
RISCVMatInt::generateTwoRegInstSeq(Imm, STI, ShiftAmt, AddOpc);
  return SeqLo.empty() || (SeqLo.size() + 2) > STI.getMaxBuildIntsCost();
}
bool RISCVLegalizerInfo::legalizeVScale(MachineInstr &MI,
MachineIRBuilder &MIB) const {
const LLT XLenTy(STI.getXLenVT());
Register Dst = MI.getOperand(0).getReg();
  // We define our scalable vector types for LMUL=1 to use a 64-bit known
  // minimum size, e.g. <vscale x 2 x i32>. VLENB is in bytes, so we calculate
  // vscale as VLENB / 8.
static_assert(RISCV::RVVBitsPerBlock == 64, "Unexpected bits per block!");
if (STI.getRealMinVLen() < RISCV::RVVBitsPerBlock)
// Support for VLEN==32 is incomplete.
return false;
// We assume VLENB is a multiple of 8. We manually choose the best shift
// here because SimplifyDemandedBits isn't always able to simplify it.
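  // For example, Val == 2 gives Log2 == 1, so Dst = VLENB >> (3 - 1);
  // Val == 16 gives Log2 == 4, so Dst = VLENB << (4 - 3); and Val == 8
  // reads VLENB directly.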
uint64_t Val = MI.getOperand(1).getCImm()->getZExtValue();
if (isPowerOf2_64(Val)) {
uint64_t Log2 = Log2_64(Val);
if (Log2 < 3) {
auto VLENB = MIB.buildInstr(RISCV::G_READ_VLENB, {XLenTy}, {});
MIB.buildLShr(Dst, VLENB, MIB.buildConstant(XLenTy, 3 - Log2));
} else if (Log2 > 3) {
auto VLENB = MIB.buildInstr(RISCV::G_READ_VLENB, {XLenTy}, {});
MIB.buildShl(Dst, VLENB, MIB.buildConstant(XLenTy, Log2 - 3));
} else {
MIB.buildInstr(RISCV::G_READ_VLENB, {Dst}, {});
}
} else if ((Val % 8) == 0) {
// If the multiplier is a multiple of 8, scale it down to avoid needing
// to shift the VLENB value.
auto VLENB = MIB.buildInstr(RISCV::G_READ_VLENB, {XLenTy}, {});
MIB.buildMul(Dst, VLENB, MIB.buildConstant(XLenTy, Val / 8));
} else {
auto VLENB = MIB.buildInstr(RISCV::G_READ_VLENB, {XLenTy}, {});
auto VScale = MIB.buildLShr(XLenTy, VLENB, MIB.buildConstant(XLenTy, 3));
MIB.buildMul(Dst, VScale, MIB.buildConstant(XLenTy, Val));
}
MI.eraseFromParent();
return true;
}
// Custom-lower extensions from mask vectors by using a vselect either with 1
// for zero/any-extension or -1 for sign-extension:
// (vXiN = (s|z)ext vXi1:vmask) -> (vXiN = vselect vmask, (-1 or 1), 0)
// Note that any-extension is lowered identically to zero-extension.
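// A sketch of the resulting generic MIR (types and register names are
// illustrative):
//   %true:_(s32) = G_CONSTANT i32 -1 ; or 1 for zext/anyext
//   %zero:_(s32) = G_CONSTANT i32 0
//   %ts:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR %true(s32)
//   %zs:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR %zero(s32)
//   %dst:_(<vscale x 4 x s32>) = G_SELECT %mask(<vscale x 4 x s1>), %ts, %zs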
bool RISCVLegalizerInfo::legalizeExt(MachineInstr &MI,
MachineIRBuilder &MIB) const {
unsigned Opc = MI.getOpcode();
assert(Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_SEXT ||
Opc == TargetOpcode::G_ANYEXT);
MachineRegisterInfo &MRI = *MIB.getMRI();
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
int64_t ExtTrueVal = Opc == TargetOpcode::G_SEXT ? -1 : 1;
LLT DstEltTy = DstTy.getElementType();
auto SplatZero = MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, 0));
auto SplatTrue =
MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, ExtTrueVal));
MIB.buildSelect(Dst, Src, SplatTrue, SplatZero);
MI.eraseFromParent();
return true;
}
bool RISCVLegalizerInfo::legalizeLoadStore(MachineInstr &MI,
LegalizerHelper &Helper,
MachineIRBuilder &MIB) const {
assert((isa<GLoad>(MI) || isa<GStore>(MI)) &&
"Machine instructions must be Load/Store.");
MachineRegisterInfo &MRI = *MIB.getMRI();
MachineFunction *MF = MI.getMF();
const DataLayout &DL = MIB.getDataLayout();
LLVMContext &Ctx = MF->getFunction().getContext();
Register DstReg = MI.getOperand(0).getReg();
LLT DataTy = MRI.getType(DstReg);
if (!DataTy.isVector())
return false;
if (!MI.hasOneMemOperand())
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
const auto *TLI = STI.getTargetLowering();
EVT VT = EVT::getEVT(getTypeForLLT(DataTy, Ctx));
if (TLI->allowsMemoryAccessForAlignment(Ctx, DL, VT, *MMO))
return true;
unsigned EltSizeBits = DataTy.getScalarSizeInBits();
assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
"Unexpected unaligned RVV load type");
// Calculate the new vector type with i8 elements
unsigned NumElements =
DataTy.getElementCount().getKnownMinValue() * (EltSizeBits / 8);
LLT NewDataTy = LLT::scalable_vector(NumElements, 8);
Helper.bitcast(MI, 0, NewDataTy);
return true;
}
/// Return the mask type suitable for masking the provided vector type. This
/// is simply an i1 element type vector of the same (possibly scalable)
/// length.
static LLT getMaskTypeFor(LLT VecTy) {
assert(VecTy.isVector());
ElementCount EC = VecTy.getElementCount();
return LLT::vector(EC, LLT::scalar(1));
}
/// Creates an all-ones mask suitable for masking a vector of type VecTy with
/// vector length VL.
static MachineInstrBuilder buildAllOnesMask(LLT VecTy, const SrcOp &VL,
MachineIRBuilder &MIB,
MachineRegisterInfo &MRI) {
LLT MaskTy = getMaskTypeFor(VecTy);
return MIB.buildInstr(RISCV::G_VMSET_VL, {MaskTy}, {VL});
}
/// Gets the two common "VL" operands: an all-ones mask and the vector length.
/// VecTy is a scalable vector type.
static std::pair<MachineInstrBuilder, MachineInstrBuilder>
buildDefaultVLOps(LLT VecTy, MachineIRBuilder &MIB, MachineRegisterInfo &MRI) {
assert(VecTy.isScalableVector() && "Expecting scalable container type");
const RISCVSubtarget &STI = MIB.getMF().getSubtarget<RISCVSubtarget>();
LLT XLenTy(STI.getXLenVT());
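  // A VL of -1 is the VLMAX sentinel, i.e. operate on all elements.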
auto VL = MIB.buildConstant(XLenTy, -1);
auto Mask = buildAllOnesMask(VecTy, VL, MIB, MRI);
return {Mask, VL};
}
static MachineInstrBuilder
buildSplatPartsS64WithVL(const DstOp &Dst, const SrcOp &Passthru, Register Lo,
Register Hi, const SrcOp &VL, MachineIRBuilder &MIB,
MachineRegisterInfo &MRI) {
  // TODO: If the Hi bits of the splat are undefined, then it's fine to just
  // splat Lo even if it might be sign-extended. I don't think we have
  // introduced a case where we're building an s64 whose upper bits are undef
  // yet.
  // Fall back to a stack store and stride-x0 vector load.
  // TODO: G_SPLAT_VECTOR_SPLIT_I64_VL still needs to be lowered; this is done
  // in preprocessDAG in SDAG.
return MIB.buildInstr(RISCV::G_SPLAT_VECTOR_SPLIT_I64_VL, {Dst},
{Passthru, Lo, Hi, VL});
}
static MachineInstrBuilder
buildSplatSplitS64WithVL(const DstOp &Dst, const SrcOp &Passthru,
const SrcOp &Scalar, const SrcOp &VL,
MachineIRBuilder &MIB, MachineRegisterInfo &MRI) {
  assert(Scalar.getLLTTy(MRI) == LLT::scalar(64) && "Unexpected scalar type!");
auto Unmerge = MIB.buildUnmerge(LLT::scalar(32), Scalar);
return buildSplatPartsS64WithVL(Dst, Passthru, Unmerge.getReg(0),
Unmerge.getReg(1), VL, MIB, MRI);
}
// Lower splats of s1 types to G_ICMP. For each mask vector type, we have a
// legal equivalently-sized i8 type, so we can use that as a go-between.
// Splats of s1 types that have a constant value can be legalized as VMSET_VL
// or VMCLR_VL.
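// For a non-constant s1 splat, the code below produces (illustratively):
//   %e:_(s8) = G_AND (G_ZEXT %val(s1)), 1
//   %lhs:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR %e(s8)
//   %zs:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR 0
//   %dst:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), %lhs, %zs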
bool RISCVLegalizerInfo::legalizeSplatVector(MachineInstr &MI,
MachineIRBuilder &MIB) const {
assert(MI.getOpcode() == TargetOpcode::G_SPLAT_VECTOR);
MachineRegisterInfo &MRI = *MIB.getMRI();
Register Dst = MI.getOperand(0).getReg();
Register SplatVal = MI.getOperand(1).getReg();
LLT VecTy = MRI.getType(Dst);
LLT XLenTy(STI.getXLenVT());
  // Handle case of s64 element vectors on RV32.
if (XLenTy.getSizeInBits() == 32 &&
VecTy.getElementType().getSizeInBits() == 64) {
auto [_, VL] = buildDefaultVLOps(MRI.getType(Dst), MIB, MRI);
buildSplatSplitS64WithVL(Dst, MIB.buildUndef(VecTy), SplatVal, VL, MIB,
MRI);
MI.eraseFromParent();
return true;
}
// All-zeros or all-ones splats are handled specially.
MachineInstr &SplatValMI = *MRI.getVRegDef(SplatVal);
if (isAllOnesOrAllOnesSplat(SplatValMI, MRI)) {
auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
MIB.buildInstr(RISCV::G_VMSET_VL, {Dst}, {VL});
MI.eraseFromParent();
return true;
}
if (isNullOrNullSplat(SplatValMI, MRI)) {
auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
MIB.buildInstr(RISCV::G_VMCLR_VL, {Dst}, {VL});
MI.eraseFromParent();
return true;
}
  // Handle a non-constant mask splat (i.e., we don't know whether it is all
  // zeros or all ones) by promoting it to an s8 splat.
LLT InterEltTy = LLT::scalar(8);
LLT InterTy = VecTy.changeElementType(InterEltTy);
auto ZExtSplatVal = MIB.buildZExt(InterEltTy, SplatVal);
auto And =
MIB.buildAnd(InterEltTy, ZExtSplatVal, MIB.buildConstant(InterEltTy, 1));
auto LHS = MIB.buildSplatVector(InterTy, And);
auto ZeroSplat =
MIB.buildSplatVector(InterTy, MIB.buildConstant(InterEltTy, 0));
MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, LHS, ZeroSplat);
MI.eraseFromParent();
return true;
}
static LLT getLMUL1Ty(LLT VecTy) {
assert(VecTy.getElementType().getSizeInBits() <= 64 &&
"Unexpected vector LLT");
return LLT::scalable_vector(RISCV::RVVBitsPerBlock /
VecTy.getElementType().getSizeInBits(),
VecTy.getElementType());
}
bool RISCVLegalizerInfo::legalizeExtractSubvector(MachineInstr &MI,
MachineIRBuilder &MIB) const {
GExtractSubvector &ES = cast<GExtractSubvector>(MI);
MachineRegisterInfo &MRI = *MIB.getMRI();
Register Dst = ES.getReg(0);
Register Src = ES.getSrcVec();
uint64_t Idx = ES.getIndexImm();
// With an index of 0 this is a cast-like subvector, which can be performed
// with subregister operations.
if (Idx == 0)
return true;
LLT LitTy = MRI.getType(Dst);
LLT BigTy = MRI.getType(Src);
if (LitTy.getElementType() == LLT::scalar(1)) {
    // We can't slide this mask vector down indexed by its i1 elements.
    // This poses a problem when we wish to extract a scalable vector which
    // can't be re-expressed as a larger type. Just choose the slow path and
    // extend to a larger type, then truncate back down.
LLT ExtBigTy = BigTy.changeElementType(LLT::scalar(8));
LLT ExtLitTy = LitTy.changeElementType(LLT::scalar(8));
auto BigZExt = MIB.buildZExt(ExtBigTy, Src);
auto ExtractZExt = MIB.buildExtractSubvector(ExtLitTy, BigZExt, Idx);
auto SplatZero = MIB.buildSplatVector(
ExtLitTy, MIB.buildConstant(ExtLitTy.getElementType(), 0));
MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, ExtractZExt, SplatZero);
MI.eraseFromParent();
return true;
}
// extract_subvector scales the index by vscale if the subvector is scalable,
// and decomposeSubvectorInsertExtractToSubRegs takes this into account.
const RISCVRegisterInfo *TRI = STI.getRegisterInfo();
MVT LitTyMVT = getMVTForLLT(LitTy);
auto Decompose =
RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
getMVTForLLT(BigTy), LitTyMVT, Idx, TRI);
unsigned RemIdx = Decompose.second;
// If the Idx has been completely eliminated then this is a subvector extract
// which naturally aligns to a vector register. These can easily be handled
// using subregister manipulation.
if (RemIdx == 0)
return true;
// Else LitTy is M1 or smaller and may need to be slid down: if LitTy
// was > M1 then the index would need to be a multiple of VLMAX, and so would
// divide exactly.
assert(
RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(LitTyMVT)).second ||
RISCVTargetLowering::getLMUL(LitTyMVT) == RISCVII::VLMUL::LMUL_1);
// If the vector type is an LMUL-group type, extract a subvector equal to the
// nearest full vector register type.
LLT InterLitTy = BigTy;
Register Vec = Src;
if (TypeSize::isKnownGT(BigTy.getSizeInBits(),
getLMUL1Ty(BigTy).getSizeInBits())) {
// If BigTy has an LMUL > 1, then LitTy should have a smaller LMUL, and
// we should have successfully decomposed the extract into a subregister.
assert(Decompose.first != RISCV::NoSubRegister);
InterLitTy = getLMUL1Ty(BigTy);
    // SDAG builds a TargetExtractSubreg. We cannot create the equivalent, a
    // COPY with a subregister index on the source register, since generic
    // virtual registers do not allow subregister indices.
Vec = MIB.buildExtractSubvector(InterLitTy, Src, Idx - RemIdx).getReg(0);
}
// Slide this vector register down by the desired number of elements in order
// to place the desired subvector starting at element 0.
const LLT XLenTy(STI.getXLenVT());
auto SlidedownAmt = MIB.buildVScale(XLenTy, RemIdx);
auto [Mask, VL] = buildDefaultVLOps(LitTy, MIB, MRI);
uint64_t Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
auto Slidedown = MIB.buildInstr(
RISCV::G_VSLIDEDOWN_VL, {InterLitTy},
{MIB.buildUndef(InterLitTy), Vec, SlidedownAmt, Mask, VL, Policy});
  // Now that the vector is in the right position, extract our final subvector.
  // This should resolve to a COPY.
MIB.buildExtractSubvector(Dst, Slidedown, 0);
MI.eraseFromParent();
return true;
}
bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
LegalizerHelper &Helper,
MachineIRBuilder &MIB) const {
GInsertSubvector &IS = cast<GInsertSubvector>(MI);
MachineRegisterInfo &MRI = *MIB.getMRI();
Register Dst = IS.getReg(0);
Register BigVec = IS.getBigVec();
Register LitVec = IS.getSubVec();
uint64_t Idx = IS.getIndexImm();
LLT BigTy = MRI.getType(BigVec);
LLT LitTy = MRI.getType(LitVec);
if (Idx == 0 ||
MRI.getVRegDef(BigVec)->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
return true;
  // We don't have the ability to slide mask vectors up indexed by their i1
  // elements; the smallest we can do is i8. Often we are able to bitcast to
  // equivalent i8 vectors. Otherwise, we must zero-extend to equivalent i8
  // vectors and truncate back down after the insert.
if (LitTy.getElementType() == LLT::scalar(1)) {
auto BigTyMinElts = BigTy.getElementCount().getKnownMinValue();
auto LitTyMinElts = LitTy.getElementCount().getKnownMinValue();
if (BigTyMinElts >= 8 && LitTyMinElts >= 8)
return Helper.bitcast(
IS, 0,
LLT::vector(BigTy.getElementCount().divideCoefficientBy(8), 8));
// We can't slide this mask vector up indexed by its i1 elements.
// This poses a problem when we wish to insert a scalable vector which
// can't be re-expressed as a larger type. Just choose the slow path and
// extend to a larger type, then truncate back down.
LLT ExtBigTy = BigTy.changeElementType(LLT::scalar(8));
return Helper.widenScalar(IS, 0, ExtBigTy);
}
const RISCVRegisterInfo *TRI = STI.getRegisterInfo();
unsigned SubRegIdx, RemIdx;
std::tie(SubRegIdx, RemIdx) =
RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
getMVTForLLT(BigTy), getMVTForLLT(LitTy), Idx, TRI);
TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
assert(isPowerOf2_64(
STI.expandVScale(LitTy.getSizeInBits()).getKnownMinValue()));
bool ExactlyVecRegSized =
STI.expandVScale(LitTy.getSizeInBits())
.isKnownMultipleOf(STI.expandVScale(VecRegSize));
// If the Idx has been completely eliminated and this subvector's size is a
// vector register or a multiple thereof, or the surrounding elements are
// undef, then this is a subvector insert which naturally aligns to a vector
// register. These can easily be handled using subregister manipulation.
if (RemIdx == 0 && ExactlyVecRegSized)
return true;
// If the subvector is smaller than a vector register, then the insertion
// must preserve the undisturbed elements of the register. We do this by
// lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
// (which resolves to a subregister copy), performing a VSLIDEUP to place the
// subvector within the vector register, and an INSERT_SUBVECTOR of that
// LMUL=1 type back into the larger vector (resolving to another subregister
// operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
// to avoid allocating a large register group to hold our subvector.
  // VSLIDEUP works by leaving elements 0<=i<OFFSET undisturbed, setting
  // elements OFFSET<=i<VL to the "subvector", and applying the tail policy to
  // elements VL<=i<VLMAX (in our case, undisturbed). This means we can set up
  // a subvector insertion where OFFSET is the insertion offset and VL is
  // OFFSET plus the size of the subvector.
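  // For example, inserting nxv1s32 at index vscale x 1 into an nxv2s32
  // register gives OFFSET = vscale x 1 and VL = vscale x 2; since that VL
  // covers the whole LMUL=1 register, the tail policy can be agnostic.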
const LLT XLenTy(STI.getXLenVT());
LLT InterLitTy = BigTy;
Register AlignedExtract = BigVec;
unsigned AlignedIdx = Idx - RemIdx;
if (TypeSize::isKnownGT(BigTy.getSizeInBits(),
getLMUL1Ty(BigTy).getSizeInBits())) {
InterLitTy = getLMUL1Ty(BigTy);
// Extract a subvector equal to the nearest full vector register type. This
// should resolve to a G_EXTRACT on a subreg.
AlignedExtract =
MIB.buildExtractSubvector(InterLitTy, BigVec, AlignedIdx).getReg(0);
}
auto Insert = MIB.buildInsertSubvector(InterLitTy, MIB.buildUndef(InterLitTy),
LitVec, 0);
auto [Mask, _] = buildDefaultVLOps(BigTy, MIB, MRI);
auto VL = MIB.buildVScale(XLenTy, LitTy.getElementCount().getKnownMinValue());
// If we're inserting into the lowest elements, use a tail undisturbed
// vmv.v.v.
MachineInstrBuilder Inserted;
bool NeedInsertSubvec =
TypeSize::isKnownGT(BigTy.getSizeInBits(), InterLitTy.getSizeInBits());
Register InsertedDst =
NeedInsertSubvec ? MRI.createGenericVirtualRegister(InterLitTy) : Dst;
if (RemIdx == 0) {
Inserted = MIB.buildInstr(RISCV::G_VMV_V_V_VL, {InsertedDst},
{AlignedExtract, Insert, VL});
} else {
auto SlideupAmt = MIB.buildVScale(XLenTy, RemIdx);
// Construct the vector length corresponding to RemIdx + length(LitTy).
VL = MIB.buildAdd(XLenTy, SlideupAmt, VL);
// Use tail agnostic policy if we're inserting over InterLitTy's tail.
ElementCount EndIndex =
ElementCount::getScalable(RemIdx) + LitTy.getElementCount();
uint64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
if (STI.expandVScale(EndIndex) ==
STI.expandVScale(InterLitTy.getElementCount()))
Policy = RISCVII::TAIL_AGNOSTIC;
Inserted =
MIB.buildInstr(RISCV::G_VSLIDEUP_VL, {InsertedDst},
{AlignedExtract, Insert, SlideupAmt, Mask, VL, Policy});
}
// If required, insert this subvector back into the correct vector register.
// This should resolve to an INSERT_SUBREG instruction.
if (NeedInsertSubvec)
MIB.buildInsertSubvector(Dst, BigVec, Inserted, AlignedIdx);
MI.eraseFromParent();
return true;
}
bool RISCVLegalizerInfo::legalizeCustom(
LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const {
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
MachineFunction &MF = *MI.getParent()->getParent();
switch (MI.getOpcode()) {
default:
// No idea what to do.
return false;
case TargetOpcode::G_ABS:
return Helper.lowerAbsToMaxNeg(MI);
// TODO: G_FCONSTANT
case TargetOpcode::G_CONSTANT: {
const Function &F = MF.getFunction();
// TODO: if PSI and BFI are present, add " ||
// llvm::shouldOptForSize(*CurMBB, PSI, BFI)".
bool ShouldOptForSize = F.hasOptSize() || F.hasMinSize();
const ConstantInt *ConstVal = MI.getOperand(1).getCImm();
if (!shouldBeInConstantPool(ConstVal->getValue(), ShouldOptForSize))
return true;
return Helper.lowerConstant(MI);
}
case TargetOpcode::G_SEXT_INREG: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
int64_t SizeInBits = MI.getOperand(2).getImm();
// Source size of 32 is sext.w.
if (DstTy.getSizeInBits() == 64 && SizeInBits == 32)
return true;
if (STI.hasStdExtZbb() && (SizeInBits == 8 || SizeInBits == 16))
return true;
return Helper.lower(MI, 0, /* Unused hint type */ LLT()) ==
LegalizerHelper::Legalized;
}
case TargetOpcode::G_IS_FPCLASS: {
Register GISFPCLASS = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
const MachineOperand &ImmOp = MI.getOperand(2);
MachineIRBuilder MIB(MI);
    // Map LLVM IR's floating-point classes to RISC-V's fclass encoding by
    // simply rotating the 10-bit immediate right by two bits.
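    // For instance, fcNegInf is bit 2 of the IR mask but bit 0 of the fclass
    // result, and fcSNan (bit 0) is fclass bit 8; rotr(2) realigns the whole
    // 10-bit mask at once.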
APInt GFpClassImm(10, static_cast<uint64_t>(ImmOp.getImm()));
auto FClassMask = MIB.buildConstant(sXLen, GFpClassImm.rotr(2).zext(XLen));
auto ConstZero = MIB.buildConstant(sXLen, 0);
auto GFClass = MIB.buildInstr(RISCV::G_FCLASS, {sXLen}, {Src});
auto And = MIB.buildAnd(sXLen, GFClass, FClassMask);
MIB.buildICmp(CmpInst::ICMP_NE, GISFPCLASS, And, ConstZero);
MI.eraseFromParent();
return true;
}
case TargetOpcode::G_VASTART:
return legalizeVAStart(MI, MIRBuilder);
case TargetOpcode::G_VSCALE:
return legalizeVScale(MI, MIRBuilder);
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ANYEXT:
return legalizeExt(MI, MIRBuilder);
case TargetOpcode::G_SPLAT_VECTOR:
return legalizeSplatVector(MI, MIRBuilder);
case TargetOpcode::G_EXTRACT_SUBVECTOR:
return legalizeExtractSubvector(MI, MIRBuilder);
case TargetOpcode::G_INSERT_SUBVECTOR:
return legalizeInsertSubvector(MI, Helper, MIRBuilder);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE:
return legalizeLoadStore(MI, Helper, MIRBuilder);
}
llvm_unreachable("expected switch to return");
}