llvm/lib/Transforms/Vectorize/VPlanSLP.cpp - llvm-project - Git at Google

 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// This file implements SLP analysis based on VPlan. The analysis is based on
 /// the ideas described in
 ///
 ///   Look-ahead SLP: auto-vectorization in the presence of commutative
 ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
 ///   Luís F. W. Góes
 ///
 //===----------------------------------------------------------------------===//

 #include "VPlan.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <algorithm>
 #include <cassert>
 #include <iterator>
 #include <utility>

 using namespace llvm;

 #define DEBUG_TYPE "vplan-slp"

 // Number of levels to look ahead when re-ordering multi node operands.
 static unsigned LookaheadMaxDepth = 5;

 VPInstruction *VPlanSlp::markFailed() {
   // FIXME: Currently this is used to signal we hit instructions we cannot
   //        trivially SLP'ize.
   CompletelySLP = false;
   return nullptr;
 }

 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
   if (all_of(Operands, [](VPValue *V) {
         return cast<VPInstruction>(V)->getUnderlyingInstr();
       })) {
     unsigned BundleSize = 0;
     for (VPValue *V : Operands) {
       Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
       assert(!T->isVectorTy() && "Only scalar types supported for now");
       BundleSize += T->getScalarSizeInBits();
     }
     WidestBundleBits = std::max(WidestBundleBits, BundleSize);
   }

   auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
   assert(Res.second &&
          "Already created a combined instruction for the operand bundle");
   (void)Res;
 }

 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
   // Currently we only support VPInstructions.
   if (!all_of(Operands, [](VPValue *Op) {
         return Op && isa<VPInstruction>(Op) &&
                cast<VPInstruction>(Op)->getUnderlyingInstr();
       })) {
     LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
     return false;
   }

   // Check if opcodes and type width agree for all instructions in the bundle.
   // FIXME: Differing widths/opcodes can be handled by inserting additional
   //        instructions.
   // FIXME: Deal with non-primitive types.
   const Instruction *OriginalInstr =
       cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
   unsigned Opcode = OriginalInstr->getOpcode();
   unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
   if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
         const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
         return I->getOpcode() == Opcode &&
                I->getType()->getPrimitiveSizeInBits() == Width;
       })) {
     LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
     return false;
   }

   // For now, all operands must be defined in the same BB.
   if (any_of(Operands, [this](VPValue *Op) {
         return cast<VPInstruction>(Op)->getParent() != &this->BB;
       })) {
     LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
     return false;
   }

   if (any_of(Operands,
              [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
     LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
     return false;
   }

   // For loads, check that there are no instructions writing to memory in
   // between them.
   // TODO: we only have to forbid instructions writing to memory that could
   //       interfere with any of the loads in the bundle
   if (Opcode == Instruction::Load) {
     unsigned LoadsSeen = 0;
     VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
     for (auto &I : *Parent) {
       auto *VPI = dyn_cast<VPInstruction>(&I);
       if (!VPI)
         break;
       if (VPI->getOpcode() == Instruction::Load &&
           llvm::is_contained(Operands, VPI))
         LoadsSeen++;

       if (LoadsSeen == Operands.size())
         break;
       if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
         LLVM_DEBUG(
             dbgs() << "VPSLP: instruction modifying memory between loads\n");
         return false;
       }
     }

     if (!all_of(Operands, [](VPValue *Op) {
           return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
               ->isSimple();
         })) {
       LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
       return false;
     }
   }

   if (Opcode == Instruction::Store)
     if (!all_of(Operands, [](VPValue *Op) {
           return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
               ->isSimple();
         })) {
       LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
       return false;
     }

   return true;
 }

 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                              unsigned OperandIndex) {
   SmallVector<VPValue *, 4> Operands;
   for (VPValue *V : Values) {
     // Currently we only support VPInstructions.
     auto *U = cast<VPInstruction>(V);
     Operands.push_back(U->getOperand(OperandIndex));
   }
   return Operands;
 }

 static bool areCommutative(ArrayRef<VPValue *> Values) {
   return Instruction::isCommutative(
       cast<VPInstruction>(Values[0])->getOpcode());
 }

 static SmallVector<SmallVector<VPValue *, 4>, 4>
 getOperands(ArrayRef<VPValue *> Values) {
   SmallVector<SmallVector<VPValue *, 4>, 4> Result;
   auto *VPI = cast<VPInstruction>(Values[0]);

   switch (VPI->getOpcode()) {
   case Instruction::Load:
     llvm_unreachable("Loads terminate a tree, no need to get operands");
   case Instruction::Store:
     Result.push_back(getOperands(Values, 0));
     break;
   default:
     for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
       Result.push_back(getOperands(Values, I));
     break;
   }

   return Result;
 }

 /// Returns the opcode of Values or ~0 if they do not all agree.
 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
   unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
   if (any_of(Values, [Opcode](VPValue *V) {
         return cast<VPInstruction>(V)->getOpcode() != Opcode;
       }))
     return None;
   return {Opcode};
 }

 /// Returns true if A and B access sequential memory if they are loads or
 /// stores or if they have identical opcodes otherwise.
 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                   VPInterleavedAccessInfo &IAI) {
   if (A->getOpcode() != B->getOpcode())
     return false;

   if (A->getOpcode() != Instruction::Load &&
       A->getOpcode() != Instruction::Store)
     return true;
   auto *GA = IAI.getInterleaveGroup(A);
   auto *GB = IAI.getInterleaveGroup(B);

   return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
 }

 /// Implements getLAScore from Listing 7 in the paper.
 /// Traverses and compares operands of V1 and V2 to MaxLevel.
 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                            VPInterleavedAccessInfo &IAI) {
   auto *I1 = dyn_cast<VPInstruction>(V1);
   auto *I2 = dyn_cast<VPInstruction>(V2);
   // Currently we only support VPInstructions.
   if (!I1 || !I2)
     return 0;

   if (MaxLevel == 0)
     return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);

   unsigned Score = 0;
   for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
     for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
       Score +=
           getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
   return Score;
 }

 std::pair<VPlanSlp::OpMode, VPValue *>
 VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                   SmallPtrSetImpl<VPValue *> &Candidates,
                   VPInterleavedAccessInfo &IAI) {
   assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
          "Currently we only handle load and commutative opcodes");
   LLVM_DEBUG(dbgs() << "      getBest\n");

   SmallVector<VPValue *, 4> BestCandidates;
   LLVM_DEBUG(dbgs() << "        Candidates  for "
                     << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
   for (auto *Candidate : Candidates) {
     auto *LastI = cast<VPInstruction>(Last);
     auto *CandidateI = cast<VPInstruction>(Candidate);
     if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
       LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                         << " ");
       BestCandidates.push_back(Candidate);
     }
   }
   LLVM_DEBUG(dbgs() << "\n");

   if (BestCandidates.empty())
     return {OpMode::Failed, nullptr};

   if (BestCandidates.size() == 1)
     return {Mode, BestCandidates[0]};

   VPValue *Best = nullptr;
   unsigned BestScore = 0;
   for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
     unsigned PrevScore = ~0u;
     bool AllSame = true;

     // FIXME: Avoid visiting the same operands multiple times.
     for (auto *Candidate : BestCandidates) {
       unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
       if (PrevScore == ~0u)
         PrevScore = Score;
       if (PrevScore != Score)
         AllSame = false;
       PrevScore = Score;

       if (Score > BestScore) {
         BestScore = Score;
         Best = Candidate;
       }
     }
     if (!AllSame)
       break;
   }
   LLVM_DEBUG(dbgs() << "Found best "
                     << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                     << "\n");
   Candidates.erase(Best);

   return {Mode, Best};
 }

 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
   SmallVector<MultiNodeOpTy, 4> FinalOrder;
   SmallVector<OpMode, 4> Mode;
   FinalOrder.reserve(MultiNodeOps.size());
   Mode.reserve(MultiNodeOps.size());

   LLVM_DEBUG(dbgs() << "Reordering multinode\n");

   for (auto &Operands : MultiNodeOps) {
     FinalOrder.push_back({Operands.first, {Operands.second[0]}});
     if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
         Instruction::Load)
       Mode.push_back(OpMode::Load);
     else
       Mode.push_back(OpMode::Opcode);
   }

   for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
     LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
     SmallPtrSet<VPValue *, 4> Candidates;
     LLVM_DEBUG(dbgs() << "  Candidates  ");
     for (auto Ops : MultiNodeOps) {
       LLVM_DEBUG(
           dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                  << " ");
       Candidates.insert(Ops.second[Lane]);
     }
     LLVM_DEBUG(dbgs() << "\n");

     for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
       LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
       if (Mode[Op] == OpMode::Failed)
         continue;

       VPValue *Last = FinalOrder[Op].second[Lane - 1];
       std::pair<OpMode, VPValue *> Res =
           getBest(Mode[Op], Last, Candidates, IAI);
       if (Res.second)
         FinalOrder[Op].second.push_back(Res.second);
       else
         // TODO: handle this case
         FinalOrder[Op].second.push_back(markFailed());
     }
   }

   return FinalOrder;
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
   dbgs() << " Ops: ";
   for (auto Op : Values) {
     if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
       if (auto *Instr = VPInstr->getUnderlyingInstr()) {
         dbgs() << *Instr << " | ";
         continue;
       }
     dbgs() << " nullptr | ";
   }
   dbgs() << "\n";
 }
 #endif

 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
   assert(!Values.empty() && "Need some operands!");

   // If we already visited this instruction bundle, re-use the existing node
   auto I = BundleToCombined.find(to_vector<4>(Values));
   if (I != BundleToCombined.end()) {
 #ifndef NDEBUG
     // Check that the resulting graph is a tree. If we re-use a node, this means
     // its values have multiple users. We only allow this, if all users of each
     // value are the same instruction.
     for (auto *V : Values) {
       auto UI = V->user_begin();
       auto *FirstUser = *UI++;
       while (UI != V->user_end()) {
         assert(*UI == FirstUser && "Currently we only support SLP trees.");
         UI++;
       }
     }
 #endif
     return I->second;
   }

   // Dump inputs
   LLVM_DEBUG({
     dbgs() << "buildGraph: ";
     dumpBundle(Values);
   });

   if (!areVectorizable(Values))
     return markFailed();

   assert(getOpcode(Values) && "Opcodes for all values must match");
   unsigned ValuesOpcode = getOpcode(Values).getValue();

   SmallVector<VPValue *, 4> CombinedOperands;
   if (areCommutative(Values)) {
     bool MultiNodeRoot = !MultiNodeActive;
     MultiNodeActive = true;
     for (auto &Operands : getOperands(Values)) {
       LLVM_DEBUG({
         dbgs() << "  Visiting Commutative";
         dumpBundle(Operands);
       });

       auto OperandsOpcode = getOpcode(Operands);
       if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
         LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
         CombinedOperands.push_back(buildGraph(Operands));
       } else {
         LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
         // Create dummy VPInstruction, which will we replace later by the
         // re-ordered operand.
         VPInstruction *Op = new VPInstruction(0, {});
         CombinedOperands.push_back(Op);
         MultiNodeOps.emplace_back(Op, Operands);
       }
     }

     if (MultiNodeRoot) {
       LLVM_DEBUG(dbgs() << "Reorder \n");
       MultiNodeActive = false;

       auto FinalOrder = reorderMultiNodeOps();

       MultiNodeOps.clear();
       for (auto &Ops : FinalOrder) {
         VPInstruction *NewOp = buildGraph(Ops.second);
         Ops.first->replaceAllUsesWith(NewOp);
         for (unsigned i = 0; i < CombinedOperands.size(); i++)
           if (CombinedOperands[i] == Ops.first)
             CombinedOperands[i] = NewOp;
         delete Ops.first;
         Ops.first = NewOp;
       }
       LLVM_DEBUG(dbgs() << "Found final order\n");
     }
   } else {
     LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
     if (ValuesOpcode == Instruction::Load)
       for (VPValue *V : Values)
         CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
     else
       for (auto &Operands : getOperands(Values))
         CombinedOperands.push_back(buildGraph(Operands));
   }

   unsigned Opcode;
   switch (ValuesOpcode) {
   case Instruction::Load:
     Opcode = VPInstruction::SLPLoad;
     break;
   case Instruction::Store:
     Opcode = VPInstruction::SLPStore;
     break;
   default:
     Opcode = ValuesOpcode;
     break;
   }

   if (!CompletelySLP)
     return markFailed();

   assert(CombinedOperands.size() > 0 && "Need more some operands");
   auto *VPI = new VPInstruction(Opcode, CombinedOperands);
   VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());

   LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                     << *cast<VPInstruction>(Values[0]) << "\n");
   addCombined(Values, VPI);
   return VPI;
 }
	//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	/// This file implements SLP analysis based on VPlan. The analysis is based on
	/// the ideas described in
	///
	/// Look-ahead SLP: auto-vectorization in the presence of commutative
	/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
	/// Luís F. W. Góes
	///
	//===----------------------------------------------------------------------===//

	#include "VPlan.h"
	#include "llvm/ADT/DepthFirstIterator.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/GraphWriter.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
	#include <algorithm>
	#include <cassert>
	#include <iterator>
	#include <utility>

	using namespace llvm;

	#define DEBUG_TYPE "vplan-slp"

	// Number of levels to look ahead when re-ordering multi node operands.
	static unsigned LookaheadMaxDepth = 5;

	VPInstruction *VPlanSlp::markFailed() {
	// FIXME: Currently this is used to signal we hit instructions we cannot
	// trivially SLP'ize.
	CompletelySLP = false;
	return nullptr;
	}

	void VPlanSlp::addCombined(ArrayRef<VPValue > Operands, VPInstruction New) {
	if (all_of(Operands, [](VPValue *V) {
	return cast<VPInstruction>(V)->getUnderlyingInstr();
	})) {
	unsigned BundleSize = 0;
	for (VPValue *V : Operands) {
	Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
	assert(!T->isVectorTy() && "Only scalar types supported for now");
	BundleSize += T->getScalarSizeInBits();
	}
	WidestBundleBits = std::max(WidestBundleBits, BundleSize);
	}

	auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
	assert(Res.second &&
	"Already created a combined instruction for the operand bundle");
	(void)Res;
	}

	bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
	// Currently we only support VPInstructions.
	if (!all_of(Operands, [](VPValue *Op) {
	return Op && isa<VPInstruction>(Op) &&
	cast<VPInstruction>(Op)->getUnderlyingInstr();
	})) {
	LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
	return false;
	}

	// Check if opcodes and type width agree for all instructions in the bundle.
	// FIXME: Differing widths/opcodes can be handled by inserting additional
	// instructions.
	// FIXME: Deal with non-primitive types.
	const Instruction *OriginalInstr =
	cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
	unsigned Opcode = OriginalInstr->getOpcode();
	unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
	if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
	const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
	return I->getOpcode() == Opcode &&
	I->getType()->getPrimitiveSizeInBits() == Width;
	})) {
	LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
	return false;
	}

	// For now, all operands must be defined in the same BB.
	if (any_of(Operands, [this](VPValue *Op) {
	return cast<VPInstruction>(Op)->getParent() != &this->BB;
	})) {
	LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
	return false;
	}

	if (any_of(Operands,
	[](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
	LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
	return false;
	}

	// For loads, check that there are no instructions writing to memory in
	// between them.
	// TODO: we only have to forbid instructions writing to memory that could
	// interfere with any of the loads in the bundle
	if (Opcode == Instruction::Load) {
	unsigned LoadsSeen = 0;
	VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
	for (auto &I : *Parent) {
	auto *VPI = dyn_cast<VPInstruction>(&I);
	if (!VPI)
	break;
	if (VPI->getOpcode() == Instruction::Load &&
	llvm::is_contained(Operands, VPI))
	LoadsSeen++;

	if (LoadsSeen == Operands.size())
	break;
	if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
	LLVM_DEBUG(
	dbgs() << "VPSLP: instruction modifying memory between loads\n");
	return false;
	}
	}

	if (!all_of(Operands, [](VPValue *Op) {
	return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
	->isSimple();
	})) {
	LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
	return false;
	}
	}

	if (Opcode == Instruction::Store)
	if (!all_of(Operands, [](VPValue *Op) {
	return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
	->isSimple();
	})) {
	LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
	return false;
	}

	return true;
	}

	static SmallVector<VPValue , 4> getOperands(ArrayRef<VPValue > Values,
	unsigned OperandIndex) {
	SmallVector<VPValue *, 4> Operands;
	for (VPValue *V : Values) {
	// Currently we only support VPInstructions.
	auto *U = cast<VPInstruction>(V);
	Operands.push_back(U->getOperand(OperandIndex));
	}
	return Operands;
	}

	static bool areCommutative(ArrayRef<VPValue *> Values) {
	return Instruction::isCommutative(
	cast<VPInstruction>(Values[0])->getOpcode());
	}

	static SmallVector<SmallVector<VPValue *, 4>, 4>
	getOperands(ArrayRef<VPValue *> Values) {
	SmallVector<SmallVector<VPValue *, 4>, 4> Result;
	auto *VPI = cast<VPInstruction>(Values[0]);

	switch (VPI->getOpcode()) {
	case Instruction::Load:
	llvm_unreachable("Loads terminate a tree, no need to get operands");
	case Instruction::Store:
	Result.push_back(getOperands(Values, 0));
	break;
	default:
	for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
	Result.push_back(getOperands(Values, I));
	break;
	}

	return Result;
	}

	/// Returns the opcode of Values or ~0 if they do not all agree.
	static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
	unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
	if (any_of(Values, [Opcode](VPValue *V) {
	return cast<VPInstruction>(V)->getOpcode() != Opcode;
	}))
	return None;
	return {Opcode};
	}

	/// Returns true if A and B access sequential memory if they are loads or
	/// stores or if they have identical opcodes otherwise.
	static bool areConsecutiveOrMatch(VPInstruction A, VPInstruction B,
	VPInterleavedAccessInfo &IAI) {
	if (A->getOpcode() != B->getOpcode())
	return false;

	if (A->getOpcode() != Instruction::Load &&
	A->getOpcode() != Instruction::Store)
	return true;
	auto *GA = IAI.getInterleaveGroup(A);
	auto *GB = IAI.getInterleaveGroup(B);

	return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
	}

	/// Implements getLAScore from Listing 7 in the paper.
	/// Traverses and compares operands of V1 and V2 to MaxLevel.
	static unsigned getLAScore(VPValue V1, VPValue V2, unsigned MaxLevel,
	VPInterleavedAccessInfo &IAI) {
	auto *I1 = dyn_cast<VPInstruction>(V1);
	auto *I2 = dyn_cast<VPInstruction>(V2);
	// Currently we only support VPInstructions.
	if (!I1 \|\| !I2)
	return 0;

	if (MaxLevel == 0)
	return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);

	unsigned Score = 0;
	for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
	for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
	Score +=
	getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
	return Score;
	}

	std::pair<VPlanSlp::OpMode, VPValue *>
	VPlanSlp::getBest(OpMode Mode, VPValue *Last,
	SmallPtrSetImpl<VPValue *> &Candidates,
	VPInterleavedAccessInfo &IAI) {
	assert((Mode == OpMode::Load \|\| Mode == OpMode::Opcode) &&
	"Currently we only handle load and commutative opcodes");
	LLVM_DEBUG(dbgs() << " getBest\n");

	SmallVector<VPValue *, 4> BestCandidates;
	LLVM_DEBUG(dbgs() << " Candidates for "
	<< *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
	for (auto *Candidate : Candidates) {
	auto *LastI = cast<VPInstruction>(Last);
	auto *CandidateI = cast<VPInstruction>(Candidate);
	if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
	LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
	<< " ");
	BestCandidates.push_back(Candidate);
	}
	}
	LLVM_DEBUG(dbgs() << "\n");

	if (BestCandidates.empty())
	return {OpMode::Failed, nullptr};

	if (BestCandidates.size() == 1)
	return {Mode, BestCandidates[0]};

	VPValue *Best = nullptr;
	unsigned BestScore = 0;
	for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
	unsigned PrevScore = ~0u;
	bool AllSame = true;

	// FIXME: Avoid visiting the same operands multiple times.
	for (auto *Candidate : BestCandidates) {
	unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
	if (PrevScore == ~0u)
	PrevScore = Score;
	if (PrevScore != Score)
	AllSame = false;
	PrevScore = Score;

	if (Score > BestScore) {
	BestScore = Score;
	Best = Candidate;
	}
	}
	if (!AllSame)
	break;
	}
	LLVM_DEBUG(dbgs() << "Found best "
	<< *cast<VPInstruction>(Best)->getUnderlyingInstr()
	<< "\n");
	Candidates.erase(Best);

	return {Mode, Best};
	}

	SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
	SmallVector<MultiNodeOpTy, 4> FinalOrder;
	SmallVector<OpMode, 4> Mode;
	FinalOrder.reserve(MultiNodeOps.size());
	Mode.reserve(MultiNodeOps.size());

	LLVM_DEBUG(dbgs() << "Reordering multinode\n");

	for (auto &Operands : MultiNodeOps) {
	FinalOrder.push_back({Operands.first, {Operands.second[0]}});
	if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
	Instruction::Load)
	Mode.push_back(OpMode::Load);
	else
	Mode.push_back(OpMode::Opcode);
	}

	for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
	LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
	SmallPtrSet<VPValue *, 4> Candidates;
	LLVM_DEBUG(dbgs() << " Candidates ");
	for (auto Ops : MultiNodeOps) {
	LLVM_DEBUG(
	dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
	<< " ");
	Candidates.insert(Ops.second[Lane]);
	}
	LLVM_DEBUG(dbgs() << "\n");

	for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
	LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
	if (Mode[Op] == OpMode::Failed)
	continue;

	VPValue *Last = FinalOrder[Op].second[Lane - 1];
	std::pair<OpMode, VPValue *> Res =
	getBest(Mode[Op], Last, Candidates, IAI);
	if (Res.second)
	FinalOrder[Op].second.push_back(Res.second);
	else
	// TODO: handle this case
	FinalOrder[Op].second.push_back(markFailed());
	}
	}

	return FinalOrder;
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
	dbgs() << " Ops: ";
	for (auto Op : Values) {
	if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
	if (auto *Instr = VPInstr->getUnderlyingInstr()) {
	dbgs() << *Instr << " \| ";
	continue;
	}
	dbgs() << " nullptr \| ";
	}
	dbgs() << "\n";
	}
	#endif

	VPInstruction VPlanSlp::buildGraph(ArrayRef<VPValue > Values) {
	assert(!Values.empty() && "Need some operands!");

	// If we already visited this instruction bundle, re-use the existing node
	auto I = BundleToCombined.find(to_vector<4>(Values));
	if (I != BundleToCombined.end()) {
	#ifndef NDEBUG
	// Check that the resulting graph is a tree. If we re-use a node, this means
	// its values have multiple users. We only allow this, if all users of each
	// value are the same instruction.
	for (auto *V : Values) {
	auto UI = V->user_begin();
	auto FirstUser = UI++;
	while (UI != V->user_end()) {
	assert(*UI == FirstUser && "Currently we only support SLP trees.");
	UI++;
	}
	}
	#endif
	return I->second;
	}

	// Dump inputs
	LLVM_DEBUG({
	dbgs() << "buildGraph: ";
	dumpBundle(Values);
	});

	if (!areVectorizable(Values))
	return markFailed();

	assert(getOpcode(Values) && "Opcodes for all values must match");
	unsigned ValuesOpcode = getOpcode(Values).getValue();

	SmallVector<VPValue *, 4> CombinedOperands;
	if (areCommutative(Values)) {
	bool MultiNodeRoot = !MultiNodeActive;
	MultiNodeActive = true;
	for (auto &Operands : getOperands(Values)) {
	LLVM_DEBUG({
	dbgs() << " Visiting Commutative";
	dumpBundle(Operands);
	});

	auto OperandsOpcode = getOpcode(Operands);
	if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
	LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
	CombinedOperands.push_back(buildGraph(Operands));
	} else {
	LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
	// Create dummy VPInstruction, which will we replace later by the
	// re-ordered operand.
	VPInstruction *Op = new VPInstruction(0, {});
	CombinedOperands.push_back(Op);
	MultiNodeOps.emplace_back(Op, Operands);
	}
	}

	if (MultiNodeRoot) {
	LLVM_DEBUG(dbgs() << "Reorder \n");
	MultiNodeActive = false;

	auto FinalOrder = reorderMultiNodeOps();

	MultiNodeOps.clear();
	for (auto &Ops : FinalOrder) {
	VPInstruction *NewOp = buildGraph(Ops.second);
	Ops.first->replaceAllUsesWith(NewOp);
	for (unsigned i = 0; i < CombinedOperands.size(); i++)
	if (CombinedOperands[i] == Ops.first)
	CombinedOperands[i] = NewOp;
	delete Ops.first;
	Ops.first = NewOp;
	}
	LLVM_DEBUG(dbgs() << "Found final order\n");
	}
	} else {
	LLVM_DEBUG(dbgs() << " NonCommuntative\n");
	if (ValuesOpcode == Instruction::Load)
	for (VPValue *V : Values)
	CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
	else
	for (auto &Operands : getOperands(Values))
	CombinedOperands.push_back(buildGraph(Operands));
	}

	unsigned Opcode;
	switch (ValuesOpcode) {
	case Instruction::Load:
	Opcode = VPInstruction::SLPLoad;
	break;
	case Instruction::Store:
	Opcode = VPInstruction::SLPStore;
	break;
	default:
	Opcode = ValuesOpcode;
	break;
	}

	if (!CompletelySLP)
	return markFailed();

	assert(CombinedOperands.size() > 0 && "Need more some operands");
	auto *VPI = new VPInstruction(Opcode, CombinedOperands);
	VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());

	LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
	<< *cast<VPInstruction>(Values[0]) << "\n");
	addCombined(Values, VPI);
	return VPI;
	}