llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp - llvm-project - Git at Google

 //=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
 /// This file implements a pass that removes irreducible control flow.
 /// Irreducible control flow means multiple-entry loops, which this pass
 /// transforms to have a single entry.
 ///
 /// Note that LLVM has a generic pass that lowers irreducible control flow, but
 /// it linearizes control flow, turning diamonds into two triangles, which is
 /// both unnecessary and undesirable for WebAssembly.
 ///
 /// The big picture: We recursively process each "region", defined as a group
 /// of blocks with a single entry and no branches back to that entry. A region
 /// may be the entire function body, or the inner part of a loop, i.e., the
 /// loop's body without branches back to the loop entry. In each region we
 /// identify all the strongly-connected components (SCCs). We fix up multi-entry
 /// loops (SCCs) by adding a new block that can dispatch to each of the loop
 /// entries, based on the value of a label "helper" variable, and we replace
 /// direct branches to the entries with assignments to the label variable and a
 /// branch to the dispatch block. Then the dispatch block is the single entry in
 /// the loop containing the previous multiple entries. Each time we fix some
 /// irreducibility, we recalculate the SCCs. After ensuring all the SCCs in a
 /// region are reducible, we recurse into them. The total time complexity of
 /// this pass is roughly:
 /// O((NumBlocks + NumEdges) * (NumNestedLoops + NumIrreducibleLoops))
 ///
 /// This pass is similar to what the Relooper [1] does. Both identify looping
 /// code that requires multiple entries, and resolve it in a similar way (in
 /// Relooper terminology, we implement a Multiple shape in a Loop shape). Note
 /// also that like the Relooper, we implement a "minimal" intervention: we only
 /// use the "label" helper for the blocks we absolutely must and no others. We
 /// also prioritize code size and do not duplicate code in order to resolve
 /// irreducibility. The graph algorithms for finding loops and entries and so
 /// forth are also similar to the Relooper. The main differences between this
 /// pass and the Relooper are:
 ///
 ///  * We just care about irreducibility, so we just look at loops.
 ///  * The Relooper emits structured control flow (with ifs etc.), while we
 ///    emit a CFG.
 ///
 /// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In
 /// Proceedings of the ACM international conference companion on Object oriented
 /// programming systems languages and applications companion (SPLASH '11). ACM,
 /// New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224
 /// http://doi.acm.org/10.1145/2048147.2048224
 ///
 //===----------------------------------------------------------------------===//

 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "WebAssembly.h"
 #include "WebAssemblySubtarget.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/Debug.h"
 #include <limits>
 using namespace llvm;

 #define DEBUG_TYPE "wasm-fix-irreducible-control-flow"

 namespace {

 // Sequence of blocks; this is what getSortedEntries() returns when a fixed
 // iteration order is needed.
 using BlockVector = SmallVector<MachineBasicBlock *, 4>;
 // Set of blocks, used for membership tests (region blocks, loop entries).
 using BlockSet = SmallPtrSet<MachineBasicBlock *, 4>;

 // Copies the blocks of Entries into a vector ordered by ascending block
 // number, so that callers can iterate over them in a fixed order.
 static BlockVector getSortedEntries(const BlockSet &Entries) {
   auto ByBlockNumber = [](const MachineBasicBlock *LHS,
                           const MachineBasicBlock *RHS) {
     return LHS->getNumber() < RHS->getNumber();
   };

   BlockVector Ordered(Entries.begin(), Entries.end());
   llvm::sort(Ordered, ByBlockNumber);
   return Ordered;
 }

 // One node of the per-region reachability graph; it wraps a single machine
 // basic block.
 struct ReachabilityNode {
   // The block this node stands for. Null until the graph assigns it, so a
   // default-initialized node never holds an indeterminate pointer.
   MachineBasicBlock *MBB = nullptr;
   // Successor nodes. ReachabilityGraph::calculate() fills this with the
   // in-region successors of MBB, leaving out the region entry.
   SmallVector<ReachabilityNode *, 4> Succs;
   // Index of the SCC this node belongs to; the maximum unsigned value means
   // that no SCC has been assigned yet.
   unsigned SCCId = std::numeric_limits<unsigned>::max();
 };

 // Computes the strongly-connected components (SCCs) of a region and records
 // which blocks are loop entries. Branches that leave the region are ignored,
 // and so are branches back to the region entry (which matters when the region
 // is the inner part of a loop).
 class ReachabilityGraph {
 public:
   // Builds the graph for the region formed by Blocks and entered through
   // Entry. Blocks is kept by reference, so it must outlive the graph.
   ReachabilityGraph(MachineBasicBlock *Entry, const BlockSet &Blocks)
       : Entry(Entry), Blocks(Blocks) {
 #ifndef NDEBUG
     // Single-entry check: every block other than the entry may only be
     // reached from inside the region.
     for (auto *MBB : Blocks) {
       if (MBB == Entry)
         continue;
       for (auto *Pred : MBB->predecessors()) {
         assert(inRegion(Pred));
       }
     }
 #endif
     calculate();
   }

   // All blocks of the region that are loop entries.
   const BlockSet &getLoopEntries() const { return LoopEntries; }

   // The loop entries that belong to the SCC with the given id.
   const BlockSet &getLoopEntriesForSCC(unsigned SCCId) const {
     return LoopEntriesBySCC[SCCId];
   }

   // Id of the SCC that MBB was placed in.
   unsigned getSCCId(MachineBasicBlock *MBB) const {
     return getNode(MBB)->SCCId;
   }

   friend struct GraphTraits<ReachabilityGraph *>;

 private:
   MachineBasicBlock *Entry;
   const BlockSet &Blocks;

   // Loop entries of the whole region, and the same information split up by
   // SCC id.
   BlockSet LoopEntries;
   SmallVector<BlockSet, 0> LoopEntriesBySCC;

   // Node storage, plus the lookup table from a block to its node.
   SmallVector<ReachabilityNode, 0> Nodes;
   DenseMap<MachineBasicBlock *, ReachabilityNode *> MBBToNodeMap;

   bool inRegion(MachineBasicBlock *MBB) const { return Blocks.count(MBB); }

   ReachabilityNode *getNode(MachineBasicBlock *MBB) const {
     return MBBToNodeMap.at(MBB);
   }

   void calculate();
 };
 } // end anonymous namespace

 namespace llvm {
 // Adapts ReachabilityGraph to the generic graph interface, which is what lets
 // calculate() walk it with scc_begin()/scc_end().
 template <> struct GraphTraits<ReachabilityGraph *> {
   using NodeRef = ReachabilityNode *;
   using ChildIteratorType = SmallVectorImpl<NodeRef>::iterator;

   // Traversal starts at the node of the region entry.
   static NodeRef getEntryNode(ReachabilityGraph *Graph) {
     MachineBasicBlock *EntryMBB = Graph->Entry;
     return Graph->getNode(EntryMBB);
   }

   // The children of a node are exactly its recorded successors.
   static ChildIteratorType child_begin(NodeRef Node) {
     return Node->Succs.begin();
   }

   static ChildIteratorType child_end(NodeRef Node) {
     return Node->Succs.end();
   }
 };
 } // end namespace llvm

 namespace {

 void ReachabilityGraph::calculate() {
   auto NumBlocks = Blocks.size();
   Nodes.assign(NumBlocks, {});

   MBBToNodeMap.clear();
   MBBToNodeMap.reserve(NumBlocks);

   // Initialize mappings.
   unsigned MBBIdx = 0;
   for (auto *MBB : Blocks) {
     auto &Node = Nodes[MBBIdx++];

     Node.MBB = MBB;
     MBBToNodeMap[MBB] = &Node;
   }

   // Add all relevant direct branches.
   MBBIdx = 0;
   for (auto *MBB : Blocks) {
     auto &Node = Nodes[MBBIdx++];

     for (auto *Succ : MBB->successors()) {
       if (Succ != Entry && inRegion(Succ)) {
         Node.Succs.push_back(getNode(Succ));
       }
     }
   }

   unsigned CurrSCCIdx = 0;
   for (auto &SCC : make_range(scc_begin(this), scc_end(this))) {
     LoopEntriesBySCC.push_back({});
     auto &SCCLoopEntries = LoopEntriesBySCC.back();

     for (auto *Node : SCC) {
       // Make sure nodes are only ever assigned one SCC
       assert(Node->SCCId == std::numeric_limits<unsigned>::max());

       Node->SCCId = CurrSCCIdx;
     }

     bool SelfLoop = false;
     if (SCC.size() == 1) {
       auto &Node = SCC[0];

       for (auto *Succ : Node->Succs) {
         if (Succ == Node) {
           SelfLoop = true;
           break;
         }
       }
     }

     // Blocks outside any (multi-block) loop will be isolated in their own
     // single-element SCC. Thus blocks that are in a loop are those in
     // multi-element SCCs or are self-looping.
     if (SCC.size() > 1 || SelfLoop) {
       // Find the loop entries - loop body blocks with predecessors outside
       // their SCC
       for (auto *Node : SCC) {
         if (Node->MBB == Entry)
           continue;

         for (auto *Pred : Node->MBB->predecessors()) {
           // This test is accurate despite not having assigned all nodes an SCC
           // yet. We only care if a node has been assigned into this SCC or not.
           if (getSCCId(Pred) != CurrSCCIdx) {
             LoopEntries.insert(Node->MBB);
             SCCLoopEntries.insert(Node->MBB);
           }
         }
       }
     }
     ++CurrSCCIdx;
   }

 #ifndef NDEBUG
   // Make sure all nodes have been processed
   for (auto &Node : Nodes) {
     assert(Node.SCCId != std::numeric_limits<unsigned>::max());
   }
 #endif
 }

 // Machine function pass that rewrites multiple-entry loops so that every loop
 // is entered through a single block.
 class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
 public:
   static char ID; // Pass identification, replacement for typeid
   WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}

 private:
   StringRef getPassName() const override {
     return "WebAssembly Fix Irreducible Control Flow";
   }

   // Pass entry point; returns true if the function was changed.
   bool runOnMachineFunction(MachineFunction &MF) override;

   // Fixes irreducibility in one region and recurses into its loops.
   bool processRegion(MachineBasicBlock *Entry, BlockSet &Blocks,
                      MachineFunction &MF);

   // Funnels all entries of one loop through a newly created dispatch block.
   void makeSingleEntryLoop(const BlockSet &Entries, BlockSet &Blocks,
                            MachineFunction &MF, const ReachabilityGraph &Graph);
 };

 // Removes irreducible control flow from the region made up of Blocks and
 // entered through Entry, then recurses into every loop of the region.
 //
 // Blocks may grow: blocks created while fixing a loop are inserted into it.
 // Returns true if the function was modified.
 bool WebAssemblyFixIrreducibleControlFlow::processRegion(
     MachineBasicBlock *Entry, BlockSet &Blocks, MachineFunction &MF) {
   bool Changed = false;
   // Remove irreducibility before processing child loops, which may take
   // multiple iterations.
   while (true) {
     ReachabilityGraph Graph(Entry, Blocks);

     // Always walk the loop entries in block-number order. Iterating the
     // pointer set directly would let the order depend on pointer values, and
     // the order is observable: there can be multiple disjoint sets of mutual
     // entries, and which one is handled first changes the output. The same
     // holds for the recursion below, since the order in which inner loops are
     // processed decides the order in which new blocks and registers are
     // created.
     const BlockVector SortedLoopEntries =
         getSortedEntries(Graph.getLoopEntries());

     bool FoundIrreducibility = false;

     for (auto *LoopEntry : SortedLoopEntries) {
       // Find mutual entries - all entries which can reach this one, and
       // are reached by it (that always includes LoopEntry itself). All mutual
       // entries must be in the same SCC, so if we have more than one, then we
       // have irreducible control flow.
       //
       // Note that irreducibility may involve inner loops, e.g. imagine A
       // starts one loop, and it has B inside it which starts an inner loop.
       // If we add a branch from all the way on the outside to B, then in a
       // sense B is no longer an "inner" loop, semantically speaking. We will
       // fix that irreducibility by adding a block that dispatches to either
       // A or B, so B will no longer be an inner loop in our output.
       // (A fancier approach might try to keep it as such.)
       //
       // Note that we still need to recurse into inner loops later, to handle
       // the case where the irreducibility is entirely nested - we would not
       // be able to identify that at this point, since the enclosing loop is
       // a group of blocks all of whom can reach each other. (We'll see the
       // irreducibility after removing branches to the top of that enclosing
       // loop.)
       const BlockSet &MutualLoopEntries =
           Graph.getLoopEntriesForSCC(Graph.getSCCId(LoopEntry));

       if (MutualLoopEntries.size() > 1) {
         makeSingleEntryLoop(MutualLoopEntries, Blocks, MF, Graph);
         FoundIrreducibility = true;
         Changed = true;
         break;
       }
     }

     // Only go on to actually process the inner loops when we are done
     // removing irreducible control flow and changing the graph. Modifying
     // the graph as we go is possible, and that might let us avoid looking at
     // the already-fixed loops again if we are careful, but all that is
     // complex and bug-prone. Since irreducible loops are rare, just starting
     // another iteration (which rebuilds the graph) is best.
     if (FoundIrreducibility)
       continue;

     for (auto *LoopEntry : SortedLoopEntries) {
       // The inner region consists of all blocks in the SCC of the loop entry.
       BlockSet InnerBlocks;
       const unsigned EntrySCCId = Graph.getSCCId(LoopEntry);
       for (auto *Block : Blocks) {
         if (Graph.getSCCId(Block) == EntrySCCId)
           InnerBlocks.insert(Block);
       }

       // Each of these calls to processRegion may change the graph, but are
       // guaranteed not to interfere with each other. The only changes we make
       // to the graph are to add blocks on the way to a loop entry. As the
       // loops are disjoint, that means we may only alter branches that exit
       // another loop, which are ignored when recursing into that other loop
       // anyhow.
       if (processRegion(LoopEntry, InnerBlocks, MF))
         Changed = true;
     }

     return Changed;
   }
 }

 // Given a set of entries to a single loop, create a single entry for that
 // loop by creating a dispatch block for them, routing control flow using
 // a helper variable. Also updates Blocks with any new blocks created, so
 // that we properly track all the blocks in the region. But this does not update
 // ReachabilityGraph; this will be updated in the caller of this function as
 // needed.
 //
 // Entries: the entries of the loop; there must be at least two (asserted).
 // Blocks:  block set of the region; the dispatch block and all routing blocks
 //          are inserted into it.
 // MF:      the function being rewritten; new blocks and the helper virtual
 //          register are created in it.
 // Graph:   only queried for the SCC ids of blocks that already existed when
 //          it was built.
 void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
     const BlockSet &Entries, BlockSet &Blocks, MachineFunction &MF,
     const ReachabilityGraph &Graph) {
   assert(Entries.size() >= 2);

   // Sort the entries to ensure a deterministic build.
   BlockVector SortedEntries = getSortedEntries(Entries);

 #ifndef NDEBUG
   // Sorting is by block number, so every entry must have a valid number and
   // neighbouring entries in the sorted order must have different numbers.
   for (auto *Block : SortedEntries)
     assert(Block->getNumber() != -1);
   if (SortedEntries.size() > 1) {
     for (auto I = SortedEntries.begin(), E = SortedEntries.end() - 1; I != E;
          ++I) {
       auto ANum = (*I)->getNumber();
       auto BNum = (*(std::next(I)))->getNumber();
       assert(ANum != BNum);
     }
   }
 #endif

   // Create a dispatch block which will contain a jump table to the entries.
   // It is appended at the end of the function.
   MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock();
   MF.insert(MF.end(), Dispatch);
   Blocks.insert(Dispatch);

   // Add the jump table.
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   MachineInstrBuilder MIB =
       BuildMI(Dispatch, DebugLoc(), TII.get(WebAssembly::BR_TABLE_I32));

   // Add the register which will be used to tell the jump table which block to
   // jump to.
   MachineRegisterInfo &MRI = MF.getRegInfo();
   Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
   MIB.addReg(Reg);

   // Compute the indices in the superheader, one for each bad block, and
   // add them as successors.
   DenseMap<MachineBasicBlock *, unsigned> Indices;
   for (auto *Entry : SortedEntries) {
     auto Pair = Indices.try_emplace(Entry);
     assert(Pair.second);

     // The index register was added before any target, so subtracting one
     // from the operand count gives the first entry index 0, the next 1, etc.
     unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
     Pair.first->second = Index;

     MIB.addMBB(Entry);
     Dispatch->addSuccessor(Entry);
   }

   // Rewrite the problematic successors for every block that wants to reach
   // the bad blocks. For simplicity, we just introduce a new block for every
   // edge we need to rewrite. (Fancier things are possible.)

   // Collect the predecessors of all entries, leaving out the dispatch block
   // just wired up above. A block that branches to several entries is listed
   // once per entry.
   BlockVector AllPreds;
   for (auto *Entry : SortedEntries) {
     for (auto *Pred : Entry->predecessors()) {
       if (Pred != Dispatch) {
         AllPreds.push_back(Pred);
       }
     }
   }

   // This set stores predecessors within this loop, i.e. those that have at
   // least one entry successor in their own SCC.
   DenseSet<MachineBasicBlock *> InLoop;
   for (auto *Pred : AllPreds) {
     auto PredSCCId = Graph.getSCCId(Pred);

     for (auto *Entry : Pred->successors()) {
       if (!Entries.count(Entry))
         continue;
       if (Graph.getSCCId(Entry) == PredSCCId) {
         InLoop.insert(Pred);
         break;
       }
     }
   }

   // Record if each entry has a layout predecessor. This map stores
   // <<loop entry, Predecessor is within the loop?>, layout predecessor>
   DenseMap<PointerIntPair<MachineBasicBlock *, 1, bool>, MachineBasicBlock *>
       EntryToLayoutPred;
   for (auto *Pred : AllPreds) {
     bool PredInLoop = InLoop.count(Pred);
     for (auto *Entry : Pred->successors())
       if (Entries.count(Entry) && Pred->isLayoutSuccessor(Entry))
         EntryToLayoutPred[{Entry, PredInLoop}] = Pred;
   }

   // We need to create at most two routing blocks per entry: one for
   // predecessors outside the loop and one for predecessors inside the loop.
   // This map stores
   // <<loop entry, Predecessor is within the loop?>, routing block>
   DenseMap<PointerIntPair<MachineBasicBlock *, 1, bool>, MachineBasicBlock *>
       Map;
   for (auto *Pred : AllPreds) {
     bool PredInLoop = InLoop.count(Pred);
     for (auto *Entry : Pred->successors()) {
       // Skip non-entries and (entry, in-loop) pairs that already have a
       // routing block.
       if (!Entries.count(Entry) || Map.count({Entry, PredInLoop}))
         continue;
       // If there exists a layout predecessor of this entry and this predecessor
       // is not that, we rather create a routing block after that layout
       // predecessor to save a branch.
       if (auto *OtherPred = EntryToLayoutPred.lookup({Entry, PredInLoop}))
         if (OtherPred != Pred)
           continue;

       // This is a successor we need to rewrite. The routing block is placed
       // right before the entry when Pred falls through to it, and at the end
       // of the function otherwise.
       MachineBasicBlock *Routing = MF.CreateMachineBasicBlock();
       MF.insert(Pred->isLayoutSuccessor(Entry)
                     ? MachineFunction::iterator(Entry)
                     : MF.end(),
                 Routing);
       Blocks.insert(Routing);

       // Set the jump table's register of the index of the block we wish to
       // jump to, and jump to the jump table.
       BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::CONST_I32), Reg)
           .addImm(Indices[Entry]);
       BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::BR)).addMBB(Dispatch);
       Routing->addSuccessor(Dispatch);
       Map[{Entry, PredInLoop}] = Routing;
     }
   }

   // Redirect every predecessor from the entries to the matching routing
   // blocks.
   for (auto *Pred : AllPreds) {
     bool PredInLoop = InLoop.count(Pred);
     // Remap the terminator operands and the successor list.
     for (MachineInstr &Term : Pred->terminators())
       for (auto &Op : Term.explicit_uses())
         if (Op.isMBB() && Indices.count(Op.getMBB()))
           Op.setMBB(Map[{Op.getMBB(), PredInLoop}]);

     for (auto *Succ : Pred->successors()) {
       if (!Entries.count(Succ))
         continue;
       auto *Routing = Map[{Succ, PredInLoop}];
       Pred->replaceSuccessor(Succ, Routing);
     }
   }

   // Create a fake default label, because br_table requires one. It simply
   // repeats the last target that was added to the table.
   MIB.addMBB(MIB.getInstr()
                  ->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1)
                  .getMBB());
 }

 } // end anonymous namespace

 // Definition of the static pass identifier declared in the pass class.
 char WebAssemblyFixIrreducibleControlFlow::ID = 0;
 // Registers the pass under DEBUG_TYPE with the description string below.
 // NOTE(review): the two trailing flags are presumably "CFG only" and "is
 // analysis" - confirm against the INITIALIZE_PASS macro.
 INITIALIZE_PASS(WebAssemblyFixIrreducibleControlFlow, DEBUG_TYPE,
                 "Removes irreducible control flow", false, false)

 // Factory for the pass: hands back a newly allocated pass object.
 FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
   FunctionPass *Pass = new WebAssemblyFixIrreducibleControlFlow();
   return Pass;
 }

 // Returns true if at least one instruction defining Reg is an ARGUMENT
 // instruction.
 static bool hasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
   bool FoundArgument = false;
   for (const auto &DefMI : MRI.def_instructions(Reg)) {
     if (!WebAssembly::isArgument(DefMI.getOpcode()))
       continue;
     // One ARGUMENT def is enough; no need to look at the remaining defs.
     FoundArgument = true;
     break;
   }
   return FoundArgument;
 }

 // Inserts an IMPLICIT_DEF at the top of the entry block for every used virtual
 // register that has no ARGUMENT definition, to cover for register uses that
 // don't have defs in every possible path.
 // TODO: This is fairly heavy-handed; find a better approach.
 static void addImplicitDefs(MachineFunction &MF) {
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineBasicBlock &Entry = *MF.begin();

   const unsigned NumVRegs = MRI.getNumVirtRegs();
   for (unsigned Idx = 0; Idx != NumVRegs; ++Idx) {
     Register Reg = Register::index2VirtReg(Idx);

     // Nothing to do for registers without any non-debug use, nor for
     // registers that already have an ARGUMENT definition.
     if (MRI.use_nodbg_empty(Reg) || hasArgumentDef(Reg, MRI))
       continue;

     BuildMI(Entry, Entry.begin(), DebugLoc(),
             TII.get(WebAssembly::IMPLICIT_DEF), Reg);
   }

   // Move ARGUMENT_* instructions to the top of the entry block, so that their
   // liveness reflects the fact that these really are live-in values. The
   // early-increment range keeps the iteration valid while instructions move.
   for (MachineInstr &MI : llvm::make_early_inc_range(Entry)) {
     if (!WebAssembly::isArgument(MI.getOpcode()))
       continue;
     MI.removeFromParent();
     Entry.insert(Entry.begin(), &MI);
   }
 }

 // Pass entry point: fixes irreducible control flow in MF and returns true if
 // the function was changed.
 bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
     MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
                        "********** Function: "
                     << MF.getName() << '\n');

   // The outermost region is the entire function body, entered through the
   // first block.
   BlockSet AllBlocks;
   for (auto &MBB : MF)
     AllBlocks.insert(&MBB);

   const bool Changed = processRegion(&*MF.begin(), AllBlocks, MF);
   if (LLVM_UNLIKELY(Changed)) {
     // We rewrote part of the function; recompute relevant things.
     MF.RenumberBlocks();
     // Now we've inserted dispatch blocks, some register uses can have incoming
     // paths without a def. For example, before this pass register %a was
     // defined in BB1 and used in BB2, and there was only one path from BB1 and
     // BB2. But if this pass inserts a dispatch block having multiple
     // predecessors between the two BBs, now there are paths to BB2 without
     // visiting BB1, and %a's use in BB2 is not dominated by its def. Adding
     // IMPLICIT_DEFs to all regs is one simple way to fix it.
     addImplicitDefs(MF);
   }

   return Changed;
 }
	//=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// This file implements a pass that removes irreducible control flow.
	/// Irreducible control flow means multiple-entry loops, which this pass
	/// transforms to have a single entry.
	///
	/// Note that LLVM has a generic pass that lowers irreducible control flow, but
	/// it linearizes control flow, turning diamonds into two triangles, which is
	/// both unnecessary and undesirable for WebAssembly.
	///
	/// The big picture: We recursively process each "region", defined as a group
	/// of blocks with a single entry and no branches back to that entry. A region
	/// may be the entire function body, or the inner part of a loop, i.e., the
	/// loop's body without branches back to the loop entry. In each region we
	/// identify all the strongly-connected components (SCCs). We fix up multi-entry
	/// loops (SCCs) by adding a new block that can dispatch to each of the loop
	/// entries, based on the value of a label "helper" variable, and we replace
	/// direct branches to the entries with assignments to the label variable and a
	/// branch to the dispatch block. Then the dispatch block is the single entry in
	/// the loop containing the previous multiple entries. Each time we fix some
	/// irreducibility, we recalculate the SCCs. After ensuring all the SCCs in a
	/// region are reducible, we recurse into them. The total time complexity of
	/// this pass is roughly:
	/// O((NumBlocks + NumEdges) * (NumNestedLoops + NumIrreducibleLoops))
	///
	/// This pass is similar to what the Relooper [1] does. Both identify looping
	/// code that requires multiple entries, and resolve it in a similar way (in
	/// Relooper terminology, we implement a Multiple shape in a Loop shape). Note
	/// also that like the Relooper, we implement a "minimal" intervention: we only
	/// use the "label" helper for the blocks we absolutely must and no others. We
	/// also prioritize code size and do not duplicate code in order to resolve
	/// irreducibility. The graph algorithms for finding loops and entries and so
	/// forth are also similar to the Relooper. The main differences between this
	/// pass and the Relooper are:
	///
	/// * We just care about irreducibility, so we just look at loops.
	/// * The Relooper emits structured control flow (with ifs etc.), while we
	/// emit a CFG.
	///
	/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In
	/// Proceedings of the ACM international conference companion on Object oriented
	/// programming systems languages and applications companion (SPLASH '11). ACM,
	/// New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224
	/// http://doi.acm.org/10.1145/2048147.2048224
	///
	//===----------------------------------------------------------------------===//

	#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
	#include "WebAssembly.h"
	#include "WebAssemblySubtarget.h"
	#include "llvm/ADT/SCCIterator.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/Support/Debug.h"
	#include <limits>
	using namespace llvm;

	#define DEBUG_TYPE "wasm-fix-irreducible-control-flow"

	namespace {

	// Sequence of blocks; this is what getSortedEntries() returns when a fixed
	// iteration order is needed.
	using BlockVector = SmallVector<MachineBasicBlock *, 4>;
	// Set of blocks, used for membership tests (region blocks, loop entries).
	using BlockSet = SmallPtrSet<MachineBasicBlock *, 4>;

	// Copies the blocks of Entries into a vector ordered by ascending block
	// number, so that callers can iterate over them in a fixed order.
	static BlockVector getSortedEntries(const BlockSet &Entries) {
	  // The comparator takes the blocks by pointer: the set stores pointers, and
	  // the body dereferences its arguments with '->'.
	  auto ByBlockNumber = [](const MachineBasicBlock *LHS,
	                          const MachineBasicBlock *RHS) {
	    return LHS->getNumber() < RHS->getNumber();
	  };

	  BlockVector Ordered(Entries.begin(), Entries.end());
	  llvm::sort(Ordered, ByBlockNumber);
	  return Ordered;
	}

	// One node of the per-region reachability graph; it wraps a single machine
	// basic block.
	struct ReachabilityNode {
	  // The block this node stands for. Null until the graph assigns it, so a
	  // default-initialized node never holds an indeterminate pointer.
	  MachineBasicBlock *MBB = nullptr;
	  // Successor nodes. ReachabilityGraph::calculate() fills this with the
	  // in-region successors of MBB, leaving out the region entry.
	  SmallVector<ReachabilityNode *, 4> Succs;
	  // Index of the SCC this node belongs to; the maximum unsigned value means
	  // that no SCC has been assigned yet.
	  unsigned SCCId = std::numeric_limits<unsigned>::max();
	};

	// Computes the strongly-connected components (SCCs) of a region and records
	// which blocks are loop entries. Branches that leave the region are ignored,
	// and so are branches back to the region entry (which matters when the region
	// is the inner part of a loop).
	class ReachabilityGraph {
	public:
	  // Builds the graph for the region formed by Blocks and entered through
	  // Entry. Blocks is kept by reference, so it must outlive the graph.
	  ReachabilityGraph(MachineBasicBlock *Entry, const BlockSet &Blocks)
	      : Entry(Entry), Blocks(Blocks) {
	#ifndef NDEBUG
	    // Single-entry check: every block other than the entry may only be
	    // reached from inside the region.
	    for (auto *MBB : Blocks) {
	      if (MBB == Entry)
	        continue;
	      for (auto *Pred : MBB->predecessors()) {
	        assert(inRegion(Pred));
	      }
	    }
	#endif
	    calculate();
	  }

	  // All blocks of the region that are loop entries.
	  const BlockSet &getLoopEntries() const { return LoopEntries; }

	  // The loop entries that belong to the SCC with the given id.
	  const BlockSet &getLoopEntriesForSCC(unsigned SCCId) const {
	    return LoopEntriesBySCC[SCCId];
	  }

	  // Id of the SCC that MBB was placed in.
	  unsigned getSCCId(MachineBasicBlock *MBB) const {
	    return getNode(MBB)->SCCId;
	  }

	  friend struct GraphTraits<ReachabilityGraph *>;

	private:
	  MachineBasicBlock *Entry;
	  const BlockSet &Blocks;

	  // Loop entries of the whole region, and the same information split up by
	  // SCC id.
	  BlockSet LoopEntries;
	  SmallVector<BlockSet, 0> LoopEntriesBySCC;

	  // Node storage, plus the lookup table from a block to its node. The map
	  // is keyed and valued by pointer: calculate() stores the address of each
	  // node in it, and getSCCId() dereferences the result of getNode().
	  SmallVector<ReachabilityNode, 0> Nodes;
	  DenseMap<MachineBasicBlock *, ReachabilityNode *> MBBToNodeMap;

	  bool inRegion(MachineBasicBlock *MBB) const { return Blocks.count(MBB); }

	  ReachabilityNode *getNode(MachineBasicBlock *MBB) const {
	    return MBBToNodeMap.at(MBB);
	  }

	  void calculate();
	};
	} // end anonymous namespace

	namespace llvm {
	// Adapts ReachabilityGraph to the generic graph interface, which is what lets
	// calculate() walk it with scc_begin()/scc_end().
	template <> struct GraphTraits<ReachabilityGraph *> {
	  using NodeRef = ReachabilityNode *;
	  using ChildIteratorType = SmallVectorImpl<NodeRef>::iterator;

	  // Traversal starts at the node of the region entry.
	  static NodeRef getEntryNode(ReachabilityGraph *Graph) {
	    MachineBasicBlock *EntryMBB = Graph->Entry;
	    return Graph->getNode(EntryMBB);
	  }

	  // The children of a node are exactly its recorded successors.
	  static ChildIteratorType child_begin(NodeRef Node) {
	    return Node->Succs.begin();
	  }

	  static ChildIteratorType child_end(NodeRef Node) {
	    return Node->Succs.end();
	  }
	};
	} // end namespace llvm

	namespace {

	void ReachabilityGraph::calculate() {
	auto NumBlocks = Blocks.size();
	Nodes.assign(NumBlocks, {});

	MBBToNodeMap.clear();
	MBBToNodeMap.reserve(NumBlocks);

	// Initialize mappings.
	unsigned MBBIdx = 0;
	for (auto *MBB : Blocks) {
	auto &Node = Nodes[MBBIdx++];

	Node.MBB = MBB;
	MBBToNodeMap[MBB] = &Node;
	}

	// Add all relevant direct branches.
	MBBIdx = 0;
	for (auto *MBB : Blocks) {
	auto &Node = Nodes[MBBIdx++];

	for (auto *Succ : MBB->successors()) {
	if (Succ != Entry && inRegion(Succ)) {
	Node.Succs.push_back(getNode(Succ));
	}
	}
	}

	unsigned CurrSCCIdx = 0;
	for (auto &SCC : make_range(scc_begin(this), scc_end(this))) {
	LoopEntriesBySCC.push_back({});
	auto &SCCLoopEntries = LoopEntriesBySCC.back();

	for (auto *Node : SCC) {
	// Make sure nodes are only ever assigned one SCC
	assert(Node->SCCId == std::numeric_limits<unsigned>::max());

	Node->SCCId = CurrSCCIdx;
	}

	bool SelfLoop = false;
	if (SCC.size() == 1) {
	auto &Node = SCC[0];

	for (auto *Succ : Node->Succs) {
	if (Succ == Node) {
	SelfLoop = true;
	break;
	}
	}
	}

	// Blocks outside any (multi-block) loop will be isolated in their own
	// single-element SCC. Thus blocks that are in a loop are those in
	// multi-element SCCs or are self-looping.
	if (SCC.size() > 1 \|\| SelfLoop) {
	// Find the loop entries - loop body blocks with predecessors outside
	// their SCC
	for (auto *Node : SCC) {
	if (Node->MBB == Entry)
	continue;

	for (auto *Pred : Node->MBB->predecessors()) {
	// This test is accurate despite not having assigned all nodes an SCC
	// yet. We only care if a node has been assigned into this SCC or not.
	if (getSCCId(Pred) != CurrSCCIdx) {
	LoopEntries.insert(Node->MBB);
	SCCLoopEntries.insert(Node->MBB);
	}
	}
	}
	}
	++CurrSCCIdx;
	}

	#ifndef NDEBUG
	// Make sure all nodes have been processed
	for (auto &Node : Nodes) {
	assert(Node.SCCId != std::numeric_limits<unsigned>::max());
	}
	#endif
	}

	// Machine function pass that rewrites multiple-entry loops so that every loop
	// is entered through a single block.
	class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
	public:
	  static char ID; // Pass identification, replacement for typeid
	  WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}

	private:
	  StringRef getPassName() const override {
	    return "WebAssembly Fix Irreducible Control Flow";
	  }

	  // Pass entry point; returns true if the function was changed.
	  bool runOnMachineFunction(MachineFunction &MF) override;

	  // Fixes irreducibility in one region and recurses into its loops.
	  bool processRegion(MachineBasicBlock *Entry, BlockSet &Blocks,
	                     MachineFunction &MF);

	  // Funnels all entries of one loop through a newly created dispatch block.
	  void makeSingleEntryLoop(const BlockSet &Entries, BlockSet &Blocks,
	                           MachineFunction &MF, const ReachabilityGraph &Graph);
	};

	// Removes irreducible control flow from the region made up of Blocks and
	// entered through Entry, then recurses into every loop of the region.
	//
	// Blocks may grow: blocks created while fixing a loop are inserted into it.
	// Returns true if the function was modified.
	bool WebAssemblyFixIrreducibleControlFlow::processRegion(
	    MachineBasicBlock *Entry, BlockSet &Blocks, MachineFunction &MF) {
	  bool Changed = false;
	  // Remove irreducibility before processing child loops, which may take
	  // multiple iterations.
	  while (true) {
	    ReachabilityGraph Graph(Entry, Blocks);

	    // Always walk the loop entries in block-number order. Iterating the
	    // pointer set directly would let the order depend on pointer values, and
	    // the order is observable: there can be multiple disjoint sets of mutual
	    // entries, and which one is handled first changes the output. The same
	    // holds for the recursion below, since the order in which inner loops are
	    // processed decides the order in which new blocks and registers are
	    // created.
	    const BlockVector SortedLoopEntries =
	        getSortedEntries(Graph.getLoopEntries());

	    bool FoundIrreducibility = false;

	    for (auto *LoopEntry : SortedLoopEntries) {
	      // Find mutual entries - all entries which can reach this one, and
	      // are reached by it (that always includes LoopEntry itself). All mutual
	      // entries must be in the same SCC, so if we have more than one, then we
	      // have irreducible control flow.
	      //
	      // Note that irreducibility may involve inner loops, e.g. imagine A
	      // starts one loop, and it has B inside it which starts an inner loop.
	      // If we add a branch from all the way on the outside to B, then in a
	      // sense B is no longer an "inner" loop, semantically speaking. We will
	      // fix that irreducibility by adding a block that dispatches to either
	      // A or B, so B will no longer be an inner loop in our output.
	      // (A fancier approach might try to keep it as such.)
	      //
	      // Note that we still need to recurse into inner loops later, to handle
	      // the case where the irreducibility is entirely nested - we would not
	      // be able to identify that at this point, since the enclosing loop is
	      // a group of blocks all of whom can reach each other. (We'll see the
	      // irreducibility after removing branches to the top of that enclosing
	      // loop.)
	      const BlockSet &MutualLoopEntries =
	          Graph.getLoopEntriesForSCC(Graph.getSCCId(LoopEntry));

	      if (MutualLoopEntries.size() > 1) {
	        makeSingleEntryLoop(MutualLoopEntries, Blocks, MF, Graph);
	        FoundIrreducibility = true;
	        Changed = true;
	        break;
	      }
	    }

	    // Only go on to actually process the inner loops when we are done
	    // removing irreducible control flow and changing the graph. Modifying
	    // the graph as we go is possible, and that might let us avoid looking at
	    // the already-fixed loops again if we are careful, but all that is
	    // complex and bug-prone. Since irreducible loops are rare, just starting
	    // another iteration (which rebuilds the graph) is best.
	    if (FoundIrreducibility)
	      continue;

	    for (auto *LoopEntry : SortedLoopEntries) {
	      // The inner region consists of all blocks in the SCC of the loop entry.
	      BlockSet InnerBlocks;
	      const unsigned EntrySCCId = Graph.getSCCId(LoopEntry);
	      for (auto *Block : Blocks) {
	        if (Graph.getSCCId(Block) == EntrySCCId)
	          InnerBlocks.insert(Block);
	      }

	      // Each of these calls to processRegion may change the graph, but are
	      // guaranteed not to interfere with each other. The only changes we make
	      // to the graph are to add blocks on the way to a loop entry. As the
	      // loops are disjoint, that means we may only alter branches that exit
	      // another loop, which are ignored when recursing into that other loop
	      // anyhow.
	      if (processRegion(LoopEntry, InnerBlocks, MF))
	        Changed = true;
	    }

	    return Changed;
	  }
	}

	// Given a set of entries to a single loop, create a single entry for that
	// loop by creating a dispatch block for them, routing control flow using
	// a helper variable. Also updates Blocks with any new blocks created, so
	// that we properly track all the blocks in the region. But this does not update
	// ReachabilityGraph; this will be updated in the caller of this function as
	// needed.
	void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
	const BlockSet &Entries, BlockSet &Blocks, MachineFunction &MF,
	const ReachabilityGraph &Graph) {
	assert(Entries.size() >= 2);

	// Sort the entries to ensure a deterministic build.
	BlockVector SortedEntries = getSortedEntries(Entries);

	#ifndef NDEBUG
	for (auto *Block : SortedEntries)
	assert(Block->getNumber() != -1);
	if (SortedEntries.size() > 1) {
	for (auto I = SortedEntries.begin(), E = SortedEntries.end() - 1; I != E;
	++I) {
	auto ANum = (*I)->getNumber();
	auto BNum = (*(std::next(I)))->getNumber();
	assert(ANum != BNum);
	}
	}
	#endif

	// Create a dispatch block which will contain a jump table to the entries.
	MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock();
	MF.insert(MF.end(), Dispatch);
	Blocks.insert(Dispatch);

	// Add the jump table.
	const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
	MachineInstrBuilder MIB =
	BuildMI(Dispatch, DebugLoc(), TII.get(WebAssembly::BR_TABLE_I32));

	// Add the register which will be used to tell the jump table which block to
	// jump to.
	MachineRegisterInfo &MRI = MF.getRegInfo();
	Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
	MIB.addReg(Reg);

	// Compute the indices in the superheader, one for each bad block, and
	// add them as successors.
	DenseMap<MachineBasicBlock *, unsigned> Indices;
	for (auto *Entry : SortedEntries) {
	auto Pair = Indices.try_emplace(Entry);
	assert(Pair.second);

	unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
	Pair.first->second = Index;

	MIB.addMBB(Entry);
	Dispatch->addSuccessor(Entry);
	}

	// Rewrite the problematic successors for every block that wants to reach
	// the bad blocks. For simplicity, we just introduce a new block for every
	// edge we need to rewrite. (Fancier things are possible.)

	BlockVector AllPreds;
	for (auto *Entry : SortedEntries) {
	for (auto *Pred : Entry->predecessors()) {
	if (Pred != Dispatch) {
	AllPreds.push_back(Pred);
	}
	}
	}

	// This set stores predecessors within this loop.
	DenseSet<MachineBasicBlock *> InLoop;
	for (auto *Pred : AllPreds) {
	auto PredSCCId = Graph.getSCCId(Pred);

	for (auto *Entry : Pred->successors()) {
	if (!Entries.count(Entry))
	continue;
	if (Graph.getSCCId(Entry) == PredSCCId) {
	InLoop.insert(Pred);
	break;
	}
	}
	}

	// Record if each entry has a layout predecessor. This map stores
	// <<loop entry, Predecessor is within the loop?>, layout predecessor>
	DenseMap<PointerIntPair<MachineBasicBlock , 1, bool>, MachineBasicBlock >
	EntryToLayoutPred;
	for (auto *Pred : AllPreds) {
	bool PredInLoop = InLoop.count(Pred);
	for (auto *Entry : Pred->successors())
	if (Entries.count(Entry) && Pred->isLayoutSuccessor(Entry))
	EntryToLayoutPred[{Entry, PredInLoop}] = Pred;
	}

	// We need to create at most two routing blocks per entry: one for
	// predecessors outside the loop and one for predecessors inside the loop.
	// This map stores
	// <<loop entry, Predecessor is within the loop?>, routing block>
	DenseMap<PointerIntPair<MachineBasicBlock , 1, bool>, MachineBasicBlock >
	Map;
	for (auto *Pred : AllPreds) {
	bool PredInLoop = InLoop.count(Pred);
	for (auto *Entry : Pred->successors()) {
	if (!Entries.count(Entry) \|\| Map.count({Entry, PredInLoop}))
	continue;
	// If there exists a layout predecessor of this entry and this predecessor
	// is not that, we rather create a routing block after that layout
	// predecessor to save a branch.
	if (auto *OtherPred = EntryToLayoutPred.lookup({Entry, PredInLoop}))
	if (OtherPred != Pred)
	continue;

	// This is a successor we need to rewrite.
	MachineBasicBlock *Routing = MF.CreateMachineBasicBlock();
	MF.insert(Pred->isLayoutSuccessor(Entry)
	? MachineFunction::iterator(Entry)
	: MF.end(),
	Routing);
	Blocks.insert(Routing);

	// Set the jump table's register of the index of the block we wish to
	// jump to, and jump to the jump table.
	BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::CONST_I32), Reg)
	.addImm(Indices[Entry]);
	BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::BR)).addMBB(Dispatch);
	Routing->addSuccessor(Dispatch);
	Map[{Entry, PredInLoop}] = Routing;
	}
	}

	for (auto *Pred : AllPreds) {
	bool PredInLoop = InLoop.count(Pred);
	// Remap the terminator operands and the successor list.
	for (MachineInstr &Term : Pred->terminators())
	for (auto &Op : Term.explicit_uses())
	if (Op.isMBB() && Indices.count(Op.getMBB()))
	Op.setMBB(Map[{Op.getMBB(), PredInLoop}]);

	for (auto *Succ : Pred->successors()) {
	if (!Entries.count(Succ))
	continue;
	auto *Routing = Map[{Succ, PredInLoop}];
	Pred->replaceSuccessor(Succ, Routing);
	}
	}

	// Create a fake default label, because br_table requires one.
	MIB.addMBB(MIB.getInstr()
	->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1)
	.getMBB());
	}

	} // end anonymous namespace

	// Out-of-class definition of the pass's static ID member; the pass's
	// constructor hands it to MachineFunctionPass.
	char WebAssemblyFixIrreducibleControlFlow::ID = 0;
	// Registers the pass under DEBUG_TYPE with a human-readable description.
	// NOTE(review): the trailing `false, false` are presumably the "CFG only" and
	// "is analysis" flags of INITIALIZE_PASS - confirm against PassSupport.h.
	INITIALIZE_PASS(WebAssemblyFixIrreducibleControlFlow, DEBUG_TYPE,
	"Removes irreducible control flow", false, false)

	// Factory for the pass: heap-allocates a fresh
	// WebAssemblyFixIrreducibleControlFlow and returns it as a FunctionPass
	// pointer.
	FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
	  FunctionPass *NewPass = new WebAssemblyFixIrreducibleControlFlow();
	  return NewPass;
	}

	// Returns true if at least one instruction defining Reg (as enumerated by
	// MRI.def_instructions) has an opcode that WebAssembly::isArgument accepts,
	// and false otherwise.
	static bool hasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
	  bool FoundArgument = false;
	  for (const auto &DefMI : MRI.def_instructions(Reg)) {
	    if (!WebAssembly::isArgument(DefMI.getOpcode()))
	      continue;
	    // One ARGUMENT def is enough; no need to look at the remaining defs.
	    FoundArgument = true;
	    break;
	  }
	  return FoundArgument;
	}

	// Inserts an IMPLICIT_DEF at the top of the entry block for every virtual
	// register that is used and has no ARGUMENT def, so that uses which lack a def
	// on some path are still covered. Afterwards, hoists the ARGUMENT_*
	// instructions of the entry block to its very top.
	// TODO: This is fairly heavy-handed; find a better approach.
	static void addImplicitDefs(MachineFunction &MF) {
	  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  MachineBasicBlock &Entry = *MF.begin();

	  // The register count is read once, before anything is inserted.
	  const unsigned NumVRegs = MRI.getNumVirtRegs();
	  for (unsigned Idx = 0; Idx != NumVRegs; ++Idx) {
	    Register VReg = Register::index2VirtReg(Idx);

	    // Nothing to do for registers without (non-debug) uses, nor for those
	    // that already have an ARGUMENT definition.
	    if (MRI.use_nodbg_empty(VReg) || hasArgumentDef(VReg, MRI))
	      continue;

	    BuildMI(Entry, Entry.begin(), DebugLoc(),
	            TII.get(WebAssembly::IMPLICIT_DEF), VReg);
	  }

	  // ARGUMENT_* instructions go back to the top of the entry block, so that
	  // their liveness reflects the fact that these really are live-in values.
	  // The early-inc range keeps the iteration valid while instructions are
	  // unlinked and re-inserted.
	  for (MachineInstr &MI : llvm::make_early_inc_range(Entry)) {
	    if (!WebAssembly::isArgument(MI.getOpcode()))
	      continue;
	    MI.removeFromParent();
	    Entry.insert(Entry.begin(), &MI);
	  }
	}

	// Pass entry point. Treats the whole function body as the outermost region
	// and fixes irreducible control flow in it recursively. Returns true if the
	// function was modified.
	bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
	    MachineFunction &MF) {
	  LLVM_DEBUG(dbgs() << "******** Fixing Irreducible Control Flow ********\n"
	                       "********** Function: "
	                    << MF.getName() << '\n');

	  // The outermost region is the entire function: every block belongs to it
	  // and the first block is its entry.
	  BlockSet AllBlocks;
	  for (auto &MBB : MF)
	    AllBlocks.insert(&MBB);

	  const bool Rewritten = processRegion(&*MF.begin(), AllBlocks, MF);
	  if (!LLVM_UNLIKELY(Rewritten))
	    return false;

	  // Part of the function was rewritten; recompute relevant things.
	  MF.RenumberBlocks();

	  // With dispatch blocks inserted, some register uses may now be reachable
	  // along paths that carry no def. Example: before this pass %a was defined
	  // in BB1 and used in BB2, and the only path into BB2 came from BB1. Once a
	  // dispatch block with several predecessors sits between the two, BB2 can be
	  // reached without visiting BB1, so the def of %a no longer dominates its
	  // use. Adding IMPLICIT_DEFs to all regs is one simple way to fix that.
	  addImplicitDefs(MF);
	  return true;
	}