llvm/lib/Transforms/Utils/IRNormalizer.cpp - llvm-project - Git at Google

 //===--------------- IRNormalizer.cpp - IR Normalizer ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
 /// This file implements the IRNormalizer class which aims to transform LLVM
 /// Modules into a normal form by reordering and renaming instructions while
 /// preserving the same semantics. The normalizer makes it easier to spot
 /// semantic differences while diffing two modules which have undergone
 /// different passes.
 ///
 //===----------------------------------------------------------------------===//

 #include "llvm/Transforms/Utils/IRNormalizer.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils.h"
 #include <algorithm>
 #include <stack>

 #define DEBUG_TYPE "normalize"

 using namespace llvm;

 namespace {
 /// IRNormalizer aims to transform LLVM IR into normal form.
 class IRNormalizer {
 public:
   bool runOnFunction(Function &F);

   IRNormalizer(IRNormalizerOptions Options) : Options(Options) {}

 private:
   const IRNormalizerOptions Options;

   // Random constant for hashing, so the state isn't zero.
   const uint64_t MagicHashConstant = 0x6acaa36bef8325c5ULL;
   DenseSet<const Instruction *> NamedInstructions;

   SmallVector<Instruction *, 16> Outputs;

   /// \name Naming.
   /// @{
   void nameFunctionArguments(Function &F) const;
   void nameBasicBlocks(Function &F) const;
   void nameInstruction(Instruction *I);
   void nameAsInitialInstruction(Instruction *I) const;
   void nameAsRegularInstruction(Instruction *I);
   void foldInstructionName(Instruction *I) const;
   /// @}

   /// \name Reordering.
   /// @{
   void reorderInstructions(Function &F) const;
   void reorderDefinition(Instruction *Definition,
                          std::stack<Instruction *> &TopologicalSort,
                          SmallPtrSet<const Instruction *, 32> &Visited) const;
   void reorderInstructionOperandsByNames(Instruction *I) const;
   void reorderPHIIncomingValues(PHINode *Phi) const;
   /// @}

   /// \name Utility methods.
   /// @{
   template <typename T>
   void sortCommutativeOperands(Instruction *I, T &Operands) const;
   SmallVector<Instruction *, 16> collectOutputInstructions(Function &F) const;
   bool isOutput(const Instruction *I) const;
   bool isInitialInstruction(const Instruction *I) const;
   bool hasOnlyImmediateOperands(const Instruction *I) const;
   SetVector<int>
   getOutputFootprint(Instruction *I,
                      SmallPtrSet<const Instruction *, 32> &Visited) const;
   /// @}
 };
 } // namespace

 /// Entry method to the IRNormalizer.
 ///
 /// \param F Function to normalize.
 bool IRNormalizer::runOnFunction(Function &F) {
   nameFunctionArguments(F);
   nameBasicBlocks(F);

   Outputs = collectOutputInstructions(F);

   if (!Options.PreserveOrder)
     reorderInstructions(F);

   // TODO: Reorder basic blocks via a topological sort.

   for (auto &I : Outputs)
     nameInstruction(I);

   for (auto &I : instructions(F)) {
     if (!Options.PreserveOrder) {
       if (Options.ReorderOperands)
         reorderInstructionOperandsByNames(&I);

       if (auto *Phi = dyn_cast<PHINode>(&I))
         reorderPHIIncomingValues(Phi);
     }
     foldInstructionName(&I);
   }

   return true;
 }

 /// Numbers arguments.
 ///
 /// \param F Function whose arguments will be renamed.
 void IRNormalizer::nameFunctionArguments(Function &F) const {
   int ArgumentCounter = 0;
   for (auto &A : F.args()) {
     if (Options.RenameAll || A.getName().empty()) {
       A.setName("a" + Twine(ArgumentCounter));
       ArgumentCounter += 1;
     }
   }
 }

 /// Names basic blocks using a generated hash for each basic block in
 /// a function considering the opcode and the order of output instructions.
 ///
 /// \param F Function containing basic blocks to rename.
 void IRNormalizer::nameBasicBlocks(Function &F) const {
   for (auto &B : F) {
     // Initialize to a magic constant, so the state isn't zero.
     uint64_t Hash = MagicHashConstant;

     // Hash considering output instruction opcodes.
     for (auto &I : B)
       if (isOutput(&I))
         Hash = hashing::detail::hash_16_bytes(Hash, I.getOpcode());

     if (Options.RenameAll || B.getName().empty()) {
       // Name basic block. Substring hash to make diffs more readable.
       B.setName("bb" + std::to_string(Hash).substr(0, 5));
     }
   }
 }

 /// Names instructions graphically (recursive) in accordance with the
 /// def-use tree, starting from the initial instructions (defs), finishing at
 /// the output (top-most user) instructions (depth-first).
 ///
 /// \param I Instruction to be renamed.
 void IRNormalizer::nameInstruction(Instruction *I) {
   // Ensure instructions are not renamed. This is done
   // to prevent situation where instructions are used
   // before their definition (in phi nodes)
   if (NamedInstructions.contains(I))
     return;
   NamedInstructions.insert(I);
   if (isInitialInstruction(I)) {
     nameAsInitialInstruction(I);
   } else {
     // This must be a regular instruction.
     nameAsRegularInstruction(I);
   }
 }

 template <typename T>
 void IRNormalizer::sortCommutativeOperands(Instruction *I, T &Operands) const {
   if (!(I->isCommutative() && Operands.size() >= 2))
     return;
   auto CommutativeEnd = Operands.begin();
   std::advance(CommutativeEnd, 2);
   llvm::sort(Operands.begin(), CommutativeEnd);
 }

 /// Names instruction following the scheme:
 /// vl00000Callee(Operands)
 ///
 /// Where 00000 is a hash calculated considering instruction's opcode and output
 /// footprint. Callee's name is only included when instruction's type is
 /// CallInst. In cases where instruction is commutative, operands list is also
 /// sorted.
 ///
 /// Renames instruction only when RenameAll flag is raised or instruction is
 /// unnamed.
 ///
 /// \see getOutputFootprint()
 /// \param I Instruction to be renamed.
 void IRNormalizer::nameAsInitialInstruction(Instruction *I) const {
   if (I->getType()->isVoidTy())
     return;
   if (!(I->getName().empty() || Options.RenameAll))
     return;
   LLVM_DEBUG(dbgs() << "Naming initial instruction: " << *I << "\n");

   // Instruction operands for further sorting.
   SmallVector<SmallString<64>, 4> Operands;

   // Collect operands.
   for (auto &Op : I->operands()) {
     if (!isa<Function>(Op)) {
       std::string TextRepresentation;
       raw_string_ostream Stream(TextRepresentation);
       Op->printAsOperand(Stream, false);
       Operands.push_back(StringRef(Stream.str()));
     }
   }

   sortCommutativeOperands(I, Operands);

   // Initialize to a magic constant, so the state isn't zero.
   uint64_t Hash = MagicHashConstant;

   // Consider instruction's opcode in the hash.
   Hash = hashing::detail::hash_16_bytes(Hash, I->getOpcode());

   SmallPtrSet<const Instruction *, 32> Visited;
   // Get output footprint for I.
   SetVector<int> OutputFootprint = getOutputFootprint(I, Visited);

   // Consider output footprint in the hash.
   for (const int &Output : OutputFootprint)
     Hash = hashing::detail::hash_16_bytes(Hash, Output);

   // Base instruction name.
   SmallString<256> Name;
   Name.append("vl" + std::to_string(Hash).substr(0, 5));

   // In case of CallInst, consider callee in the instruction name.
   if (const auto *CI = dyn_cast<CallInst>(I)) {
     Function *F = CI->getCalledFunction();

     if (F != nullptr)
       Name.append(F->getName());
   }

   Name.append("(");
   for (size_t i = 0; i < Operands.size(); ++i) {
     Name.append(Operands[i]);

     if (i < Operands.size() - 1)
       Name.append(", ");
   }
   Name.append(")");

   I->setName(Name);
 }

 /// Names instruction following the scheme:
 /// op00000Callee(Operands)
 ///
 /// Where 00000 is a hash calculated considering instruction's opcode, its
 /// operands' opcodes and order. Callee's name is only included when
 /// instruction's type is CallInst. In cases where instruction is commutative,
 /// operand list is also sorted.
 ///
 /// Names instructions recursively in accordance with the def-use tree,
 /// starting from the initial instructions (defs), finishing at
 /// the output (top-most user) instructions (depth-first).
 ///
 /// Renames instruction only when RenameAll flag is raised or instruction is
 /// unnamed.
 ///
 /// \see getOutputFootprint()
 /// \param I Instruction to be renamed.
 void IRNormalizer::nameAsRegularInstruction(Instruction *I) {
   LLVM_DEBUG(dbgs() << "Naming regular instruction: " << *I << "\n");

   // Instruction operands for further sorting.
   SmallVector<SmallString<128>, 4> Operands;

   // The name of a regular instruction depends
   // on the names of its operands. Hence, all
   // operands must be named first in the use-def
   // walk.

   // Collect operands.
   for (auto &Op : I->operands()) {
     if (auto *I = dyn_cast<Instruction>(Op)) {
       // Walk down the use-def chain.
       nameInstruction(I);
       Operands.push_back(I->getName());
     } else if (!isa<Function>(Op)) {
       // This must be an immediate value.
       std::string TextRepresentation;
       raw_string_ostream Stream(TextRepresentation);
       Op->printAsOperand(Stream, false);
       Operands.push_back(StringRef(Stream.str()));
     }
   }

   sortCommutativeOperands(I, Operands);

   // Initialize to a magic constant, so the state isn't zero.
   uint64_t Hash = MagicHashConstant;

   // Consider instruction opcode in the hash.
   Hash = hashing::detail::hash_16_bytes(Hash, I->getOpcode());

   // Operand opcodes for further sorting (commutative).
   SmallVector<int, 4> OperandsOpcodes;

   // Collect operand opcodes for hashing.
   for (auto &Op : I->operands())
     if (auto *I = dyn_cast<Instruction>(Op))
       OperandsOpcodes.push_back(I->getOpcode());

   sortCommutativeOperands(I, OperandsOpcodes);

   // Consider operand opcodes in the hash.
   for (const int Code : OperandsOpcodes)
     Hash = hashing::detail::hash_16_bytes(Hash, Code);

   // Base instruction name.
   SmallString<512> Name;
   Name.append("op" + std::to_string(Hash).substr(0, 5));

   // In case of CallInst, consider callee in the instruction name.
   if (const auto *CI = dyn_cast<CallInst>(I))
     if (const Function *F = CI->getCalledFunction())
       Name.append(F->getName());

   Name.append("(");
   for (size_t i = 0; i < Operands.size(); ++i) {
     Name.append(Operands[i]);

     if (i < Operands.size() - 1)
       Name.append(", ");
   }
   Name.append(")");

   if ((I->getName().empty() || Options.RenameAll) && !I->getType()->isVoidTy())
     I->setName(Name);
 }

 /// Shortens instruction's name. This method removes called function name from
 /// the instruction name and substitutes the call chain with a corresponding
 /// list of operands.
 ///
 /// Examples:
 /// op00000Callee(op00001Callee(...), vl00000Callee(1, 2), ...)  ->
 /// op00000(op00001, vl00000, ...) vl00000Callee(1, 2)  ->  vl00000(1, 2)
 ///
 /// This method omits output instructions and pre-output (instructions directly
 /// used by an output instruction) instructions (by default). By default it also
 /// does not affect user named instructions.
 ///
 /// \param I Instruction whose name will be folded.
 void IRNormalizer::foldInstructionName(Instruction *I) const {
   // If this flag is raised, fold all regular
   // instructions (including pre-outputs).
   if (!Options.FoldPreOutputs) {
     // Don't fold if one of the users is an output instruction.
     for (auto *U : I->users())
       if (auto *IU = dyn_cast<Instruction>(U))
         if (isOutput(IU))
           return;
   }

   // Don't fold if it is an output instruction or has no op prefix.
   if (isOutput(I) || I->getName().substr(0, 2) != "op")
     return;

   // Instruction operands.
   SmallVector<SmallString<64>, 4> Operands;

   for (auto &Op : I->operands()) {
     if (const auto *I = dyn_cast<Instruction>(Op)) {
       bool HasNormalName = I->getName().substr(0, 2) == "op" ||
                            I->getName().substr(0, 2) == "vl";

       Operands.push_back(HasNormalName ? I->getName().substr(0, 7)
                                        : I->getName());
     }
   }

   sortCommutativeOperands(I, Operands);

   SmallString<256> Name;
   Name.append(I->getName().substr(0, 7));

   Name.append("(");
   for (size_t i = 0; i < Operands.size(); ++i) {
     Name.append(Operands[i]);

     if (i < Operands.size() - 1)
       Name.append(", ");
   }
   Name.append(")");

   I->setName(Name);
 }

 /// Reorders instructions by walking up the tree from each operand of an output
 /// instruction and reducing the def-use distance.
 /// This method assumes that output instructions were collected top-down,
 /// otherwise the def-use chain may be broken.
 /// This method is a wrapper for recursive reorderInstruction().
 ///
 /// \see reorderInstruction()
 void IRNormalizer::reorderInstructions(Function &F) const {
   for (auto &BB : F) {
     LLVM_DEBUG(dbgs() << "Reordering instructions in basic block: "
                       << BB.getName() << "\n");
     // Find the source nodes of the DAG of instructions in this basic block.
     // Source nodes are instructions that have side effects, are terminators, or
     // don't have a parent in the DAG of instructions.
     //
     // We must iterate from the first to the last instruction otherwise side
     // effecting instructions could be reordered.

     std::stack<Instruction *> TopologicalSort;
     SmallPtrSet<const Instruction *, 32> Visited;
     for (auto &I : BB) {
       // First process side effecting and terminating instructions.
       if (!(isOutput(&I) || I.isTerminator()))
         continue;
       LLVM_DEBUG(dbgs() << "\tReordering from source effecting instruction: ";
                  I.dump());
       reorderDefinition(&I, TopologicalSort, Visited);
     }

     for (auto &I : BB) {
       // Process the remaining instructions.
       //
       // TODO: Do more a intelligent sorting of these instructions. For example,
       // seperate between dead instructinos and instructions used in another
       // block. Use properties of the CFG the order instructions that are used
       // in another block.
       if (Visited.contains(&I))
         continue;
       LLVM_DEBUG(dbgs() << "\tReordering from source instruction: "; I.dump());
       reorderDefinition(&I, TopologicalSort, Visited);
     }

     LLVM_DEBUG(dbgs() << "Inserting instructions into: " << BB.getName()
                       << "\n");
     // Reorder based on the topological sort.
     while (!TopologicalSort.empty()) {
       auto *Instruction = TopologicalSort.top();
       auto FirstNonPHIOrDbgOrAlloca = BB.getFirstNonPHIOrDbgOrAlloca();
       if (auto *Call = dyn_cast<CallInst>(&*FirstNonPHIOrDbgOrAlloca)) {
         if (Call->getIntrinsicID() ==
                 Intrinsic::experimental_convergence_entry ||
             Call->getIntrinsicID() == Intrinsic::experimental_convergence_loop)
           FirstNonPHIOrDbgOrAlloca++;
       }
       Instruction->moveBefore(FirstNonPHIOrDbgOrAlloca);
       TopologicalSort.pop();
     }
   }
 }

 void IRNormalizer::reorderDefinition(
     Instruction *Definition, std::stack<Instruction *> &TopologicalSort,
     SmallPtrSet<const Instruction *, 32> &Visited) const {
   if (Visited.contains(Definition))
     return;
   Visited.insert(Definition);

   {
     const auto *BasicBlock = Definition->getParent();
     const auto FirstNonPHIOrDbgOrAlloca =
         BasicBlock->getFirstNonPHIOrDbgOrAlloca();
     if (FirstNonPHIOrDbgOrAlloca == BasicBlock->end())
       return; // TODO: Is this necessary?
     if (Definition->comesBefore(&*FirstNonPHIOrDbgOrAlloca))
       return; // TODO: Do some kind of ordering for these instructions.
   }

   for (auto &Operand : Definition->operands()) {
     if (auto *Op = dyn_cast<Instruction>(Operand)) {
       if (Op->getParent() != Definition->getParent())
         continue; // Only reorder instruction within the same basic block
       reorderDefinition(Op, TopologicalSort, Visited);
     }
   }

   LLVM_DEBUG(dbgs() << "\t\tNext in topological sort: "; Definition->dump());
   if (Definition->isTerminator())
     return;
   if (auto *Call = dyn_cast<CallInst>(Definition)) {
     if (Call->isMustTailCall())
       return;
     if (Call->getIntrinsicID() == Intrinsic::experimental_deoptimize)
       return;
     if (Call->getIntrinsicID() == Intrinsic::experimental_convergence_entry)
       return;
     if (Call->getIntrinsicID() == Intrinsic::experimental_convergence_loop)
       return;
   }
   if (auto *BitCast = dyn_cast<BitCastInst>(Definition)) {
     if (auto *Call = dyn_cast<CallInst>(BitCast->getOperand(0))) {
       if (Call->isMustTailCall())
         return;
     }
   }

   TopologicalSort.emplace(Definition);
 }

 /// Reorders instruction's operands alphabetically. This method assumes
 /// that passed instruction is commutative. Changing the operand order
 /// in other instructions may change the semantics.
 ///
 /// \param I Instruction whose operands will be reordered.
 void IRNormalizer::reorderInstructionOperandsByNames(Instruction *I) const {
   // This method assumes that passed I is commutative,
   // changing the order of operands in other instructions
   // may change the semantics.

   // Instruction operands for further sorting.
   SmallVector<std::pair<std::string, Value *>, 4> Operands;

   // Collect operands.
   for (auto &Op : I->operands()) {
     if (auto *V = dyn_cast<Value>(Op)) {
       if (isa<Instruction>(V)) {
         // This is an an instruction.
         Operands.push_back(std::pair<std::string, Value *>(V->getName(), V));
       } else {
         std::string TextRepresentation;
         raw_string_ostream Stream(TextRepresentation);
         Op->printAsOperand(Stream, false);
         Operands.push_back(std::pair<std::string, Value *>(Stream.str(), V));
       }
     }
   }

   // Sort operands.
   sortCommutativeOperands(I, Operands);

   // Reorder operands.
   unsigned Position = 0;
   for (auto &Op : I->operands()) {
     Op.set(Operands[Position].second);
     Position += 1;
   }
 }

 /// Reorders PHI node's values according to the names of corresponding basic
 /// blocks.
 ///
 /// \param Phi PHI node to normalize.
 void IRNormalizer::reorderPHIIncomingValues(PHINode *Phi) const {
   // Values for further sorting.
   SmallVector<std::pair<Value *, BasicBlock *>, 2> Values;

   // Collect blocks and corresponding values.
   for (auto &BB : Phi->blocks()) {
     Value *V = Phi->getIncomingValueForBlock(BB);
     Values.push_back(std::pair<Value *, BasicBlock *>(V, BB));
   }

   // Sort values according to the name of a basic block.
   llvm::sort(Values, [](const std::pair<Value *, BasicBlock *> &LHS,
                         const std::pair<Value *, BasicBlock *> &RHS) {
     return LHS.second->getName() < RHS.second->getName();
   });

   // Swap.
   for (unsigned i = 0; i < Values.size(); ++i) {
     Phi->setIncomingBlock(i, Values[i].second);
     Phi->setIncomingValue(i, Values[i].first);
   }
 }

 /// Returns a vector of output instructions. An output is an instruction which
 /// has side-effects or is ReturnInst. Uses isOutput().
 ///
 /// \see isOutput()
 /// \param F Function to collect outputs from.
 SmallVector<Instruction *, 16>
 IRNormalizer::collectOutputInstructions(Function &F) const {
   // Output instructions are collected top-down in each function,
   // any change may break the def-use chain in reordering methods.
   SmallVector<Instruction *, 16> Outputs;
   for (auto &I : instructions(F))
     if (isOutput(&I))
       Outputs.push_back(&I);
   return Outputs;
 }

 /// Helper method checking whether the instruction may have side effects or is
 /// ReturnInst.
 ///
 /// \param I Considered instruction.
 bool IRNormalizer::isOutput(const Instruction *I) const {
   // Outputs are such instructions which may have side effects or is ReturnInst.
   return I->mayHaveSideEffects() || isa<ReturnInst>(I);
 }

 /// Helper method checking whether the instruction has users and only
 /// immediate operands.
 ///
 /// \param I Considered instruction.
 bool IRNormalizer::isInitialInstruction(const Instruction *I) const {
   // Initial instructions are such instructions whose values are used by
   // other instructions, yet they only depend on immediate values.
   return !I->user_empty() && hasOnlyImmediateOperands(I);
 }

 /// Helper method checking whether the instruction has only immediate operands.
 ///
 /// \param I Considered instruction.
 bool IRNormalizer::hasOnlyImmediateOperands(const Instruction *I) const {
   for (const auto &Op : I->operands())
     if (isa<Instruction>(Op))
       return false; // Found non-immediate operand (instruction).
   return true;
 }

 /// Helper method returning indices (distance from the beginning of the basic
 /// block) of outputs using the \p I (eliminates repetitions). Walks down the
 /// def-use tree recursively.
 ///
 /// \param I Considered instruction.
 /// \param Visited Set of visited instructions.
 SetVector<int> IRNormalizer::getOutputFootprint(
     Instruction *I, SmallPtrSet<const Instruction *, 32> &Visited) const {

   // Vector containing indexes of outputs (no repetitions),
   // which use I in the order of walking down the def-use tree.
   SetVector<int> Outputs;

   if (!Visited.count(I)) {
     Visited.insert(I);

     if (isOutput(I)) {
       // Gets output instruction's parent function.
       Function *Func = I->getParent()->getParent();

       // Finds and inserts the index of the output to the vector.
       unsigned Count = 0;
       for (const auto &B : *Func) {
         for (const auto &E : B) {
           if (&E == I)
             Outputs.insert(Count);
           Count += 1;
         }
       }

       // Returns to the used instruction.
       return Outputs;
     }

     for (auto *U : I->users()) {
       if (auto *UI = dyn_cast<Instruction>(U)) {
         // Vector for outputs which use UI.
         SetVector<int> OutputsUsingUI = getOutputFootprint(UI, Visited);
         // Insert the indexes of outputs using UI.
         Outputs.insert_range(OutputsUsingUI);
       }
     }
   }

   // Return to the used instruction.
   return Outputs;
 }

 PreservedAnalyses IRNormalizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) const {
   IRNormalizer(Options).runOnFunction(F);
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   return PA;
 }