//===- ExpandReductions.cpp - Expand reduction intrinsics -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR expansion for reduction intrinsics, allowing targets
// to enable the intrinsics until just before codegen.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ExpandReductions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

namespace {

bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
  bool Changed = false;
  SmallVector<IntrinsicInst *, 4> Worklist;
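  // Collect the candidate reduction intrinsics into a worklist first;
  // expanding them while walking the function would invalidate the
  // instruction iterator.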
  for (auto &I : instructions(F)) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      switch (II->getIntrinsicID()) {
      default: break;
      case Intrinsic::vector_reduce_fadd:
      case Intrinsic::vector_reduce_fmul:
      case Intrinsic::vector_reduce_add:
      case Intrinsic::vector_reduce_mul:
      case Intrinsic::vector_reduce_and:
      case Intrinsic::vector_reduce_or:
      case Intrinsic::vector_reduce_xor:
      case Intrinsic::vector_reduce_smax:
      case Intrinsic::vector_reduce_smin:
      case Intrinsic::vector_reduce_umax:
      case Intrinsic::vector_reduce_umin:
      case Intrinsic::vector_reduce_fmax:
      case Intrinsic::vector_reduce_fmin:
        if (TTI->shouldExpandReduction(II))
          Worklist.push_back(II);

        break;
      }
    }
  }

  for (auto *II : Worklist) {
    FastMathFlags FMF =
        isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
    Intrinsic::ID ID = II->getIntrinsicID();
    RecurKind RK = getMinMaxReductionRecurKind(ID);
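    // RK is only meaningful for the min/max reductions; for the other
    // intrinsic IDs getMinMaxReductionRecurKind returns RecurKind::None.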

    Value *Rdx = nullptr;
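    // Scope the builder's fast-math flags to this expansion and seed them from
    // the call, so every FP instruction created below inherits them.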
    IRBuilder<> Builder(II);
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    Builder.setFastMathFlags(FMF);
    switch (ID) {
    default: llvm_unreachable("Unexpected intrinsic!");
    case Intrinsic::vector_reduce_fadd:
    case Intrinsic::vector_reduce_fmul: {
      // FMFs must be attached to the call, otherwise it's an ordered reduction
      // and it can't be handled by generating a shuffle sequence.
      Value *Acc = II->getArgOperand(0);
      Value *Vec = II->getArgOperand(1);
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      if (!FMF.allowReassoc())
        Rdx = getOrderedReduction(Builder, Acc, Vec, RdxOpcode, RK);
      else {
        if (!isPowerOf2_32(
                cast<FixedVectorType>(Vec->getType())->getNumElements()))
          continue;
        Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
        Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx,
                                  "bin.rdx");
      }
      break;
    }
    case Intrinsic::vector_reduce_and:
    case Intrinsic::vector_reduce_or: {
      // Canonicalize logical or/and reductions:
      // Or reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp ne iReduxWidth %val, 0
      // And reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp eq iReduxWidth %val, 11111
      Value *Vec = II->getArgOperand(0);
      auto *FTy = cast<FixedVectorType>(Vec->getType());
      unsigned NumElts = FTy->getNumElements();
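      // The shuffle-based expansion below only handles power-of-two element
      // counts; otherwise leave the intrinsic in place for the backend.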
      if (!isPowerOf2_32(NumElts))
        continue;

      if (FTy->getElementType() == Builder.getInt1Ty()) {
        Rdx = Builder.CreateBitCast(Vec, Builder.getIntNTy(NumElts));
        if (ID == Intrinsic::vector_reduce_and) {
          Rdx = Builder.CreateICmpEQ(
              Rdx, ConstantInt::getAllOnesValue(Rdx->getType()));
        } else {
          assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction.");
          Rdx = Builder.CreateIsNotNull(Rdx);
        }
        break;
      }
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
      break;
    }
    case Intrinsic::vector_reduce_add:
    case Intrinsic::vector_reduce_mul:
    case Intrinsic::vector_reduce_xor:
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin: {
      Value *Vec = II->getArgOperand(0);
      if (!isPowerOf2_32(
              cast<FixedVectorType>(Vec->getType())->getNumElements()))
        continue;
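      // getShuffleReduction emits a log2(NumElts) sequence of vector shuffles
      // and binary ops, then extracts element 0 as the scalar result.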
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
      break;
    }
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin: {
      // We require "nnan" to use a shuffle reduction; "nsz" is implied by the
      // semantics of the reduction.
      Value *Vec = II->getArgOperand(0);
      if (!isPowerOf2_32(
              cast<FixedVectorType>(Vec->getType())->getNumElements()) ||
          !FMF.noNaNs())
        continue;
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
      break;
    }
    }
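    // Replace all uses of the intrinsic with the expanded scalar result and
    // delete the original call.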
    II->replaceAllUsesWith(Rdx);
    II->eraseFromParent();
    Changed = true;
  }
  return Changed;
}

class ExpandReductions : public FunctionPass {
public:
  static char ID;
  ExpandReductions() : FunctionPass(ID) {
    initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    return expandReductions(F, TTI);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};
}

char ExpandReductions::ID;
INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
                      "Expand reduction intrinsics", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
                    "Expand reduction intrinsics", false, false)

FunctionPass *llvm::createExpandReductionsPass() {
  return new ExpandReductions();
}

PreservedAnalyses ExpandReductionsPass::run(Function &F,
                                            FunctionAnalysisManager &AM) {
  const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  if (!expandReductions(F, &TTI))
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}