| //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains an optimization for div and rem on architectures that |
| // execute short instructions significantly faster than longer instructions. |
| // For example, on Intel Atom 32-bit divides are slow enough that during |
| // runtime it is profitable to check the value of the operands, and if they are |
| // positive and less than 256 use an unsigned 8-bit divide. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/Utils/BypassSlowDivision.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/None.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/Transforms/Utils/Local.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/KnownBits.h" |
| #include <cassert> |
| #include <cstdint> |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "bypass-slow-division" |
| |
| namespace { |
| |
| struct QuotRemPair { |
| Value *Quotient; |
| Value *Remainder; |
| |
| QuotRemPair(Value *InQuotient, Value *InRemainder) |
| : Quotient(InQuotient), Remainder(InRemainder) {} |
| }; |
| |
| /// A quotient and remainder, plus a BB from which they logically "originate". |
| /// If you use Quotient or Remainder in a Phi node, you should use BB as its |
| /// corresponding predecessor. |
| struct QuotRemWithBB { |
| BasicBlock *BB = nullptr; |
| Value *Quotient = nullptr; |
| Value *Remainder = nullptr; |
| }; |
| |
| using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; |
| using BypassWidthsTy = DenseMap<unsigned, unsigned>; |
| using VisitedSetTy = SmallPtrSet<Instruction *, 4>; |
| |
| enum ValueRange { |
| /// Operand definitely fits into BypassType. No runtime checks are needed. |
| VALRNG_KNOWN_SHORT, |
| /// A runtime check is required, as value range is unknown. |
| VALRNG_UNKNOWN, |
| /// Operand is unlikely to fit into BypassType. The bypassing should be |
| /// disabled. |
| VALRNG_LIKELY_LONG |
| }; |
| |
| class FastDivInsertionTask { |
| bool IsValidTask = false; |
| Instruction *SlowDivOrRem = nullptr; |
| IntegerType *BypassType = nullptr; |
| BasicBlock *MainBB = nullptr; |
| |
| bool isHashLikeValue(Value *V, VisitedSetTy &Visited); |
| ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); |
| QuotRemWithBB createSlowBB(BasicBlock *Successor); |
| QuotRemWithBB createFastBB(BasicBlock *Successor); |
| QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, |
| BasicBlock *PhiBB); |
| Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); |
| Optional<QuotRemPair> insertFastDivAndRem(); |
| |
| bool isSignedOp() { |
| return SlowDivOrRem->getOpcode() == Instruction::SDiv || |
| SlowDivOrRem->getOpcode() == Instruction::SRem; |
| } |
| |
| bool isDivisionOp() { |
| return SlowDivOrRem->getOpcode() == Instruction::SDiv || |
| SlowDivOrRem->getOpcode() == Instruction::UDiv; |
| } |
| |
| Type *getSlowType() { return SlowDivOrRem->getType(); } |
| |
| public: |
| FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); |
| |
| Value *getReplacement(DivCacheTy &Cache); |
| }; |
| |
| } // end anonymous namespace |
| |
| FastDivInsertionTask::FastDivInsertionTask(Instruction *I, |
| const BypassWidthsTy &BypassWidths) { |
| switch (I->getOpcode()) { |
| case Instruction::UDiv: |
| case Instruction::SDiv: |
| case Instruction::URem: |
| case Instruction::SRem: |
| SlowDivOrRem = I; |
| break; |
| default: |
| // I is not a div/rem operation. |
| return; |
| } |
| |
| // Skip division on vector types. Only optimize integer instructions. |
| IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); |
| if (!SlowType) |
| return; |
| |
| // Skip if this bitwidth is not bypassed. |
| auto BI = BypassWidths.find(SlowType->getBitWidth()); |
| if (BI == BypassWidths.end()) |
| return; |
| |
| // Get type for div/rem instruction with bypass bitwidth. |
| IntegerType *BT = IntegerType::get(I->getContext(), BI->second); |
| BypassType = BT; |
| |
| // The original basic block. |
| MainBB = I->getParent(); |
| |
| // The instruction is indeed a slow div or rem operation. |
| IsValidTask = true; |
| } |
| |
| /// Reuses previously-computed dividend or remainder from the current BB if |
| /// operands and operation are identical. Otherwise calls insertFastDivAndRem to |
| /// perform the optimization and caches the resulting dividend and remainder. |
| /// If no replacement can be generated, nullptr is returned. |
| Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { |
| // First, make sure that the task is valid. |
| if (!IsValidTask) |
| return nullptr; |
| |
| // Then, look for a value in Cache. |
| Value *Dividend = SlowDivOrRem->getOperand(0); |
| Value *Divisor = SlowDivOrRem->getOperand(1); |
| DivRemMapKey Key(isSignedOp(), Dividend, Divisor); |
| auto CacheI = Cache.find(Key); |
| |
| if (CacheI == Cache.end()) { |
| // If previous instance does not exist, try to insert fast div. |
| Optional<QuotRemPair> OptResult = insertFastDivAndRem(); |
| // Bail out if insertFastDivAndRem has failed. |
| if (!OptResult) |
| return nullptr; |
| CacheI = Cache.insert({Key, *OptResult}).first; |
| } |
| |
| QuotRemPair &Value = CacheI->second; |
| return isDivisionOp() ? Value.Quotient : Value.Remainder; |
| } |
| |
| /// Check if a value looks like a hash. |
| /// |
| /// The routine is expected to detect values computed using the most common hash |
| /// algorithms. Typically, hash computations end with one of the following |
| /// instructions: |
| /// |
| /// 1) MUL with a constant wider than BypassType |
| /// 2) XOR instruction |
| /// |
| /// And even if we are wrong and the value is not a hash, it is still quite |
| /// unlikely that such values will fit into BypassType. |
| /// |
| /// To detect string hash algorithms like FNV we have to look through PHI-nodes. |
| /// It is implemented as a depth-first search for values that look neither long |
| /// nor hash-like. |
| bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { |
| Instruction *I = dyn_cast<Instruction>(V); |
| if (!I) |
| return false; |
| |
| switch (I->getOpcode()) { |
| case Instruction::Xor: |
| return true; |
| case Instruction::Mul: { |
| // After Constant Hoisting pass, long constants may be represented as |
| // bitcast instructions. As a result, some constants may look like an |
| // instruction at first, and an additional check is necessary to find out if |
| // an operand is actually a constant. |
| Value *Op1 = I->getOperand(1); |
| ConstantInt *C = dyn_cast<ConstantInt>(Op1); |
| if (!C && isa<BitCastInst>(Op1)) |
| C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); |
| return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); |
| } |
| case Instruction::PHI: |
| // Stop IR traversal in case of a crazy input code. This limits recursion |
| // depth. |
| if (Visited.size() >= 16) |
| return false; |
| // Do not visit nodes that have been visited already. We return true because |
| // it means that we couldn't find any value that doesn't look hash-like. |
| if (Visited.find(I) != Visited.end()) |
| return true; |
| Visited.insert(I); |
| return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { |
| // Ignore undef values as they probably don't affect the division |
| // operands. |
| return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || |
| isa<UndefValue>(V); |
| }); |
| default: |
| return false; |
| } |
| } |
| |
| /// Check if an integer value fits into our bypass type. |
| ValueRange FastDivInsertionTask::getValueRange(Value *V, |
| VisitedSetTy &Visited) { |
| unsigned ShortLen = BypassType->getBitWidth(); |
| unsigned LongLen = V->getType()->getIntegerBitWidth(); |
| |
| assert(LongLen > ShortLen && "Value type must be wider than BypassType"); |
| unsigned HiBits = LongLen - ShortLen; |
| |
| const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); |
| KnownBits Known(LongLen); |
| |
| computeKnownBits(V, Known, DL); |
| |
| if (Known.countMinLeadingZeros() >= HiBits) |
| return VALRNG_KNOWN_SHORT; |
| |
| if (Known.countMaxLeadingZeros() < HiBits) |
| return VALRNG_LIKELY_LONG; |
| |
| // Long integer divisions are often used in hashtable implementations. It's |
| // not worth bypassing such divisions because hash values are extremely |
| // unlikely to have enough leading zeros. The call below tries to detect |
| // values that are unlikely to fit BypassType (including hashes). |
| if (isHashLikeValue(V, Visited)) |
| return VALRNG_LIKELY_LONG; |
| |
| return VALRNG_UNKNOWN; |
| } |
| |
| /// Add new basic block for slow div and rem operations and put it before |
| /// SuccessorBB. |
| QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { |
| QuotRemWithBB DivRemPair; |
| DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", |
| MainBB->getParent(), SuccessorBB); |
| IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); |
| |
| Value *Dividend = SlowDivOrRem->getOperand(0); |
| Value *Divisor = SlowDivOrRem->getOperand(1); |
| |
| if (isSignedOp()) { |
| DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); |
| DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); |
| } else { |
| DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); |
| DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); |
| } |
| |
| Builder.CreateBr(SuccessorBB); |
| return DivRemPair; |
| } |
| |
| /// Add new basic block for fast div and rem operations and put it before |
| /// SuccessorBB. |
| QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { |
| QuotRemWithBB DivRemPair; |
| DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", |
| MainBB->getParent(), SuccessorBB); |
| IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); |
| |
| Value *Dividend = SlowDivOrRem->getOperand(0); |
| Value *Divisor = SlowDivOrRem->getOperand(1); |
| Value *ShortDivisorV = |
| Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); |
| Value *ShortDividendV = |
| Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); |
| |
| // udiv/urem because this optimization only handles positive numbers. |
| Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); |
| Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); |
| DivRemPair.Quotient = |
| Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); |
| DivRemPair.Remainder = |
| Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); |
| Builder.CreateBr(SuccessorBB); |
| |
| return DivRemPair; |
| } |
| |
| /// Creates Phi nodes for result of Div and Rem. |
| QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, |
| QuotRemWithBB &RHS, |
| BasicBlock *PhiBB) { |
| IRBuilder<> Builder(PhiBB, PhiBB->begin()); |
| PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); |
| QuoPhi->addIncoming(LHS.Quotient, LHS.BB); |
| QuoPhi->addIncoming(RHS.Quotient, RHS.BB); |
| PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); |
| RemPhi->addIncoming(LHS.Remainder, LHS.BB); |
| RemPhi->addIncoming(RHS.Remainder, RHS.BB); |
| return QuotRemPair(QuoPhi, RemPhi); |
| } |
| |
| /// Creates a runtime check to test whether both the divisor and dividend fit |
| /// into BypassType. The check is inserted at the end of MainBB. True return |
| /// value means that the operands fit. Either of the operands may be NULL if it |
| /// doesn't need a runtime check. |
| Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { |
| assert((Op1 || Op2) && "Nothing to check"); |
| IRBuilder<> Builder(MainBB, MainBB->end()); |
| |
| Value *OrV; |
| if (Op1 && Op2) |
| OrV = Builder.CreateOr(Op1, Op2); |
| else |
| OrV = Op1 ? Op1 : Op2; |
| |
| // BitMask is inverted to check if the operands are |
| // larger than the bypass type |
| uint64_t BitMask = ~BypassType->getBitMask(); |
| Value *AndV = Builder.CreateAnd(OrV, BitMask); |
| |
| // Compare operand values |
| Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); |
| return Builder.CreateICmpEQ(AndV, ZeroV); |
| } |
| |
| /// Substitutes the div/rem instruction with code that checks the value of the |
| /// operands and uses a shorter-faster div/rem instruction when possible. |
| Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { |
| Value *Dividend = SlowDivOrRem->getOperand(0); |
| Value *Divisor = SlowDivOrRem->getOperand(1); |
| |
| VisitedSetTy SetL; |
| ValueRange DividendRange = getValueRange(Dividend, SetL); |
| if (DividendRange == VALRNG_LIKELY_LONG) |
| return None; |
| |
| VisitedSetTy SetR; |
| ValueRange DivisorRange = getValueRange(Divisor, SetR); |
| if (DivisorRange == VALRNG_LIKELY_LONG) |
| return None; |
| |
| bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); |
| bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); |
| |
| if (DividendShort && DivisorShort) { |
| // If both operands are known to be short then just replace the long |
| // division with a short one in-place. Since we're not introducing control |
| // flow in this case, narrowing the division is always a win, even if the |
| // divisor is a constant (and will later get replaced by a multiplication). |
| |
| IRBuilder<> Builder(SlowDivOrRem); |
| Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); |
| Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); |
| Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); |
| Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); |
| Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); |
| Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); |
| return QuotRemPair(ExtDiv, ExtRem); |
| } |
| |
| if (isa<ConstantInt>(Divisor)) { |
| // If the divisor is not a constant, DAGCombiner will convert it to a |
| // multiplication by a magic constant. It isn't clear if it is worth |
| // introducing control flow to get a narrower multiply. |
| return None; |
| } |
| |
| // After Constant Hoisting pass, long constants may be represented as |
| // bitcast instructions. As a result, some constants may look like an |
| // instruction at first, and an additional check is necessary to find out if |
| // an operand is actually a constant. |
| if (auto *BCI = dyn_cast<BitCastInst>(Divisor)) |
| if (BCI->getParent() == SlowDivOrRem->getParent() && |
| isa<ConstantInt>(BCI->getOperand(0))) |
| return None; |
| |
| if (DividendShort && !isSignedOp()) { |
| // If the division is unsigned and Dividend is known to be short, then |
| // either |
| // 1) Divisor is less or equal to Dividend, and the result can be computed |
| // with a short division. |
| // 2) Divisor is greater than Dividend. In this case, no division is needed |
| // at all: The quotient is 0 and the remainder is equal to Dividend. |
| // |
| // So instead of checking at runtime whether Divisor fits into BypassType, |
| // we emit a runtime check to differentiate between these two cases. This |
| // lets us entirely avoid a long div. |
| |
| // Split the basic block before the div/rem. |
| BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); |
| // Remove the unconditional branch from MainBB to SuccessorBB. |
| MainBB->getInstList().back().eraseFromParent(); |
| QuotRemWithBB Long; |
| Long.BB = MainBB; |
| Long.Quotient = ConstantInt::get(getSlowType(), 0); |
| Long.Remainder = Dividend; |
| QuotRemWithBB Fast = createFastBB(SuccessorBB); |
| QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); |
| IRBuilder<> Builder(MainBB, MainBB->end()); |
| Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); |
| Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); |
| return Result; |
| } else { |
| // General case. Create both slow and fast div/rem pairs and choose one of |
| // them at runtime. |
| |
| // Split the basic block before the div/rem. |
| BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); |
| // Remove the unconditional branch from MainBB to SuccessorBB. |
| MainBB->getInstList().back().eraseFromParent(); |
| QuotRemWithBB Fast = createFastBB(SuccessorBB); |
| QuotRemWithBB Slow = createSlowBB(SuccessorBB); |
| QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); |
| Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, |
| DivisorShort ? nullptr : Divisor); |
| IRBuilder<> Builder(MainBB, MainBB->end()); |
| Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); |
| return Result; |
| } |
| } |
| |
| /// This optimization identifies DIV/REM instructions in a BB that can be |
| /// profitably bypassed and carried out with a shorter, faster divide. |
| bool llvm::bypassSlowDivision(BasicBlock *BB, |
| const BypassWidthsTy &BypassWidths) { |
| DivCacheTy PerBBDivCache; |
| |
| bool MadeChange = false; |
| Instruction* Next = &*BB->begin(); |
| while (Next != nullptr) { |
| // We may add instructions immediately after I, but we want to skip over |
| // them. |
| Instruction* I = Next; |
| Next = Next->getNextNode(); |
| |
| FastDivInsertionTask Task(I, BypassWidths); |
| if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { |
| I->replaceAllUsesWith(Replacement); |
| I->eraseFromParent(); |
| MadeChange = true; |
| } |
| } |
| |
| // Above we eagerly create divs and rems, as pairs, so that we can efficiently |
| // create divrem machine instructions. Now erase any unused divs / rems so we |
| // don't leave extra instructions sitting around. |
| for (auto &KV : PerBBDivCache) |
| for (Value *V : {KV.second.Quotient, KV.second.Remainder}) |
| RecursivelyDeleteTriviallyDeadInstructions(V); |
| |
| return MadeChange; |
| } |