mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp - llvm-project - Git at Google

 //===- ExpandOps.cpp - Pass to legalize Arith ops for LLVM lowering --===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Arith/Transforms/Passes.h"

 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/DialectConversion.h"

 namespace mlir {
 namespace arith {
 #define GEN_PASS_DEF_ARITHEXPANDOPSPASS
 #include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
 } // namespace arith
 } // namespace mlir

 using namespace mlir;

 /// Create an integer or index constant.
 static Value createConst(Location loc, Type type, int value,
                          PatternRewriter &rewriter) {
   auto attr = rewriter.getIntegerAttr(getElementTypeOrSelf(type), value);
   if (auto shapedTy = dyn_cast<ShapedType>(type)) {
     return rewriter.create<arith::ConstantOp>(
         loc, DenseElementsAttr::get(shapedTy, attr));
   }

   return rewriter.create<arith::ConstantOp>(loc, attr);
 }

 namespace {

 /// Expands CeilDivUIOp (n, m) into
 ///  n == 0 ? 0 : ((n-1) / m) + 1
 struct CeilDivUIOpConverter : public OpRewritePattern<arith::CeilDivUIOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(arith::CeilDivUIOp op,
                                 PatternRewriter &rewriter) const final {
     Location loc = op.getLoc();
     Value a = op.getLhs();
     Value b = op.getRhs();
     Value zero = createConst(loc, a.getType(), 0, rewriter);
     Value compare =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, a, zero);
     Value one = createConst(loc, a.getType(), 1, rewriter);
     Value minusOne = rewriter.create<arith::SubIOp>(loc, a, one);
     Value quotient = rewriter.create<arith::DivUIOp>(loc, minusOne, b);
     Value plusOne = rewriter.create<arith::AddIOp>(loc, quotient, one);
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, compare, zero, plusOne);
     return success();
   }
 };

 /// Expands CeilDivSIOp (n, m) into
 ///   1) x = (m > 0) ? -1 : 1
 ///   2) (n*m>0) ? ((n+x) / m) + 1 : - (-n / m)
 struct CeilDivSIOpConverter : public OpRewritePattern<arith::CeilDivSIOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(arith::CeilDivSIOp op,
                                 PatternRewriter &rewriter) const final {
     Location loc = op.getLoc();
     Type type = op.getType();
     Value a = op.getLhs();
     Value b = op.getRhs();
     Value plusOne = createConst(loc, type, 1, rewriter);
     Value zero = createConst(loc, type, 0, rewriter);
     Value minusOne = createConst(loc, type, -1, rewriter);
     // Compute x = (b>0) ? -1 : 1.
     Value compare =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, b, zero);
     Value x = rewriter.create<arith::SelectOp>(loc, compare, minusOne, plusOne);
     // Compute positive res: 1 + ((x+a)/b).
     Value xPlusA = rewriter.create<arith::AddIOp>(loc, x, a);
     Value xPlusADivB = rewriter.create<arith::DivSIOp>(loc, xPlusA, b);
     Value posRes = rewriter.create<arith::AddIOp>(loc, plusOne, xPlusADivB);
     // Compute negative res: - ((-a)/b).
     Value minusA = rewriter.create<arith::SubIOp>(loc, zero, a);
     Value minusADivB = rewriter.create<arith::DivSIOp>(loc, minusA, b);
     Value negRes = rewriter.create<arith::SubIOp>(loc, zero, minusADivB);
     // Result is (a*b>0) ? pos result : neg result.
     // Note, we want to avoid using a*b because of possible overflow.
     // The case that matters are a>0, a==0, a<0, b>0 and b<0. We do
     // not particuliarly care if a*b<0 is true or false when b is zero
     // as this will result in an illegal divide. So `a*b<0` can be reformulated
     // as `(a<0 && b<0) || (a>0 && b>0)' or `(a<0 && b<0) || (a>0 && b>=0)'.
     // We pick the first expression here.
     Value aNeg =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, a, zero);
     Value aPos =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, a, zero);
     Value bNeg =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, b, zero);
     Value bPos =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, b, zero);
     Value firstTerm = rewriter.create<arith::AndIOp>(loc, aNeg, bNeg);
     Value secondTerm = rewriter.create<arith::AndIOp>(loc, aPos, bPos);
     Value compareRes =
         rewriter.create<arith::OrIOp>(loc, firstTerm, secondTerm);
     // Perform substitution and return success.
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, compareRes, posRes,
                                                  negRes);
     return success();
   }
 };

 /// Expands FloorDivSIOp (n, m) into
 ///   1)  x = (m<0) ? 1 : -1
 ///   2)  return (n*m<0) ? - ((-n+x) / m) -1 : n / m
 struct FloorDivSIOpConverter : public OpRewritePattern<arith::FloorDivSIOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(arith::FloorDivSIOp op,
                                 PatternRewriter &rewriter) const final {
     Location loc = op.getLoc();
     Type type = op.getType();
     Value a = op.getLhs();
     Value b = op.getRhs();
     Value plusOne = createConst(loc, type, 1, rewriter);
     Value zero = createConst(loc, type, 0, rewriter);
     Value minusOne = createConst(loc, type, -1, rewriter);
     // Compute x = (b<0) ? 1 : -1.
     Value compare =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, b, zero);
     Value x = rewriter.create<arith::SelectOp>(loc, compare, plusOne, minusOne);
     // Compute negative res: -1 - ((x-a)/b).
     Value xMinusA = rewriter.create<arith::SubIOp>(loc, x, a);
     Value xMinusADivB = rewriter.create<arith::DivSIOp>(loc, xMinusA, b);
     Value negRes = rewriter.create<arith::SubIOp>(loc, minusOne, xMinusADivB);
     // Compute positive res: a/b.
     Value posRes = rewriter.create<arith::DivSIOp>(loc, a, b);
     // Result is (a*b<0) ? negative result : positive result.
     // Note, we want to avoid using a*b because of possible overflow.
     // The case that matters are a>0, a==0, a<0, b>0 and b<0. We do
     // not particuliarly care if a*b<0 is true or false when b is zero
     // as this will result in an illegal divide. So `a*b<0` can be reformulated
     // as `(a>0 && b<0) || (a>0 && b<0)' or `(a>0 && b<0) || (a>0 && b<=0)'.
     // We pick the first expression here.
     Value aNeg =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, a, zero);
     Value aPos =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, a, zero);
     Value bNeg =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, b, zero);
     Value bPos =
         rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, b, zero);
     Value firstTerm = rewriter.create<arith::AndIOp>(loc, aNeg, bPos);
     Value secondTerm = rewriter.create<arith::AndIOp>(loc, aPos, bNeg);
     Value compareRes =
         rewriter.create<arith::OrIOp>(loc, firstTerm, secondTerm);
     // Perform substitution and return success.
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, compareRes, negRes,
                                                  posRes);
     return success();
   }
 };

 template <typename OpTy, arith::CmpFPredicate pred>
 struct MaximumMinimumFOpConverter : public OpRewritePattern<OpTy> {
 public:
   using OpRewritePattern<OpTy>::OpRewritePattern;

   LogicalResult matchAndRewrite(OpTy op,
                                 PatternRewriter &rewriter) const final {
     Value lhs = op.getLhs();
     Value rhs = op.getRhs();

     Location loc = op.getLoc();
     // If any operand is NaN, 'cmp' will be true (and 'select' returns 'lhs').
     static_assert(pred == arith::CmpFPredicate::UGT ||
                       pred == arith::CmpFPredicate::ULT,
                   "pred must be either UGT or ULT");
     Value cmp = rewriter.create<arith::CmpFOp>(loc, pred, lhs, rhs);
     Value select = rewriter.create<arith::SelectOp>(loc, cmp, lhs, rhs);

     // Handle the case where rhs is NaN: 'isNaN(rhs) ? rhs : select'.
     Value isNaN = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
                                                  rhs, rhs);
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, isNaN, rhs, select);
     return success();
   }
 };

 template <typename OpTy, arith::CmpFPredicate pred>
 struct MaxNumMinNumFOpConverter : public OpRewritePattern<OpTy> {
 public:
   using OpRewritePattern<OpTy>::OpRewritePattern;

   LogicalResult matchAndRewrite(OpTy op,
                                 PatternRewriter &rewriter) const final {
     Value lhs = op.getLhs();
     Value rhs = op.getRhs();

     Location loc = op.getLoc();
     // If any operand is NaN, 'cmp' will be true (and 'select' returns 'lhs').
     static_assert(pred == arith::CmpFPredicate::UGT ||
                       pred == arith::CmpFPredicate::ULT,
                   "pred must be either UGT or ULT");
     Value cmp = rewriter.create<arith::CmpFOp>(loc, pred, lhs, rhs);
     Value select = rewriter.create<arith::SelectOp>(loc, cmp, lhs, rhs);

     // Handle the case where lhs is NaN: 'isNaN(lhs) ? rhs : select'.
     Value isNaN = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
                                                  lhs, lhs);
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, isNaN, rhs, select);
     return success();
   }
 };

 struct BFloat16ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(arith::ExtFOp op,
                                 PatternRewriter &rewriter) const final {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     auto operand = op.getOperand();
     Type operandTy = operand.getType();
     Type resultTy = op.getType();
     Type operandETy = getElementTypeOrSelf(operandTy);
     Type resultETy = getElementTypeOrSelf(resultTy);

     if (!operandETy.isBF16() || !resultETy.isF32()) {
       return rewriter.notifyMatchFailure(op, "not a ext of bf16 to f32.");
     }

     Type i16Ty = b.getI16Type();
     Type i32Ty = b.getI32Type();
     if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
       i16Ty = shapedTy.clone(i16Ty);
       i32Ty = shapedTy.clone(i32Ty);
     }

     Value bitcast = b.create<arith::BitcastOp>(i16Ty, operand);
     Value exti = b.create<arith::ExtUIOp>(i32Ty, bitcast);

     Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
     Value shl = b.create<arith::ShLIOp>(exti, c16);
     Value result = b.create<arith::BitcastOp>(resultTy, shl);

     rewriter.replaceOp(op, result);
     return success();
   }
 };

 struct BFloat16TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(arith::TruncFOp op,
                                 PatternRewriter &rewriter) const final {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     auto operand = op.getOperand();
     Type operandTy = operand.getType();
     Type resultTy = op.getType();
     Type operandETy = getElementTypeOrSelf(operandTy);
     Type resultETy = getElementTypeOrSelf(resultTy);

     if (!operandETy.isF32() || !resultETy.isBF16()) {
       return rewriter.notifyMatchFailure(op, "not a trunc of f32 to bf16.");
     }

     Type i1Ty = b.getI1Type();
     Type i16Ty = b.getI16Type();
     Type i32Ty = b.getI32Type();
     Type f32Ty = b.getF32Type();
     if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
       i1Ty = shapedTy.clone(i1Ty);
       i16Ty = shapedTy.clone(i16Ty);
       i32Ty = shapedTy.clone(i32Ty);
       f32Ty = shapedTy.clone(f32Ty);
     }

     Value bitcast = b.create<arith::BitcastOp>(i32Ty, operand);

     Value c23 = createConst(op.getLoc(), i32Ty, 23, rewriter);
     Value c31 = createConst(op.getLoc(), i32Ty, 31, rewriter);
     Value c23Mask = createConst(op.getLoc(), i32Ty, (1 << 23) - 1, rewriter);
     Value expMask =
         createConst(op.getLoc(), i32Ty, ((1 << 8) - 1) << 23, rewriter);
     Value expMax =
         createConst(op.getLoc(), i32Ty, ((1 << 8) - 2) << 23, rewriter);

     // Grab the sign bit.
     Value sign = b.create<arith::ShRUIOp>(bitcast, c31);

     // Our mantissa rounding value depends on the sign bit and the last
     // truncated bit.
     Value cManRound = createConst(op.getLoc(), i32Ty, (1 << 15), rewriter);
     cManRound = b.create<arith::SubIOp>(cManRound, sign);

     // Grab out the mantissa and directly apply rounding.
     Value man = b.create<arith::AndIOp>(bitcast, c23Mask);
     Value manRound = b.create<arith::AddIOp>(man, cManRound);

     // Grab the overflow bit and shift right if we overflow.
     Value roundBit = b.create<arith::ShRUIOp>(manRound, c23);
     Value manNew = b.create<arith::ShRUIOp>(manRound, roundBit);

     // Grab the exponent and round using the mantissa's carry bit.
     Value exp = b.create<arith::AndIOp>(bitcast, expMask);
     Value expCarry = b.create<arith::AddIOp>(exp, manRound);
     expCarry = b.create<arith::AndIOp>(expCarry, expMask);

     // If the exponent is saturated, we keep the max value.
     Value expCmp =
         b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, exp, expMax);
     exp = b.create<arith::SelectOp>(expCmp, exp, expCarry);

     // If the exponent is max and we rolled over, keep the old mantissa.
     Value roundBitBool = b.create<arith::TruncIOp>(i1Ty, roundBit);
     Value keepOldMan = b.create<arith::AndIOp>(expCmp, roundBitBool);
     man = b.create<arith::SelectOp>(keepOldMan, man, manNew);

     // Assemble the now rounded f32 value (as an i32).
     Value rounded = b.create<arith::ShLIOp>(sign, c31);
     rounded = b.create<arith::OrIOp>(rounded, exp);
     rounded = b.create<arith::OrIOp>(rounded, man);

     Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
     Value shr = b.create<arith::ShRUIOp>(rounded, c16);
     Value trunc = b.create<arith::TruncIOp>(i16Ty, shr);
     Value result = b.create<arith::BitcastOp>(resultTy, trunc);

     rewriter.replaceOp(op, result);
     return success();
   }
 };

 struct ArithExpandOpsPass
     : public arith::impl::ArithExpandOpsPassBase<ArithExpandOpsPass> {
   using ArithExpandOpsPassBase::ArithExpandOpsPassBase;

   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     ConversionTarget target(getContext());

     arith::populateArithExpandOpsPatterns(patterns);

     target.addLegalDialect<arith::ArithDialect>();
     // clang-format off
     target.addIllegalOp<
       arith::CeilDivSIOp,
       arith::CeilDivUIOp,
       arith::FloorDivSIOp,
       arith::MaximumFOp,
       arith::MinimumFOp,
       arith::MaxNumFOp,
       arith::MinNumFOp
     >();

     if (includeBf16) {
       arith::populateExpandBFloat16Patterns(patterns);
       target.addDynamicallyLegalOp<arith::ExtFOp>(
         [](arith::ExtFOp op) {
           Type inETy = getElementTypeOrSelf(op.getOperand().getType());
           Type outETy = getElementTypeOrSelf(op.getType());
           return !(inETy.isBF16() && outETy.isF32());
         });

       target.addDynamicallyLegalOp<arith::TruncFOp>(
         [](arith::TruncFOp op)  {
           Type inETy = getElementTypeOrSelf(op.getOperand().getType());
           Type outETy = getElementTypeOrSelf(op.getType());
           return !(inETy.isF32() && outETy.isBF16());
         });
     }

     // clang-format on
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
       signalPassFailure();
   }
 };

 } // namespace

 void mlir::arith::populateCeilFloorDivExpandOpsPatterns(
     RewritePatternSet &patterns) {
   patterns
       .add<CeilDivSIOpConverter, CeilDivUIOpConverter, FloorDivSIOpConverter>(
           patterns.getContext());
 }

 void mlir::arith::populateExpandBFloat16Patterns(RewritePatternSet &patterns) {
   patterns.add<BFloat16ExtFOpConverter, BFloat16TruncFOpConverter>(
       patterns.getContext());
 }

 void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
   populateCeilFloorDivExpandOpsPatterns(patterns);
   // clang-format off
   patterns.add<
     MaximumMinimumFOpConverter<MaximumFOp, arith::CmpFPredicate::UGT>,
     MaximumMinimumFOpConverter<MinimumFOp, arith::CmpFPredicate::ULT>,
     MaxNumMinNumFOpConverter<MaxNumFOp, arith::CmpFPredicate::UGT>,
     MaxNumMinNumFOpConverter<MinNumFOp, arith::CmpFPredicate::ULT>
    >(patterns.getContext());
   // clang-format on
 }
	//===- ExpandOps.cpp - Pass to legalize Arith ops for LLVM lowering --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Dialect/Arith/Transforms/Passes.h"

	#include "mlir/Dialect/Arith/IR/Arith.h"
	#include "mlir/Dialect/Vector/IR/VectorOps.h"
	#include "mlir/IR/ImplicitLocOpBuilder.h"
	#include "mlir/IR/TypeUtilities.h"
	#include "mlir/Transforms/DialectConversion.h"

	namespace mlir {
	namespace arith {
	#define GEN_PASS_DEF_ARITHEXPANDOPSPASS
	#include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
	} // namespace arith
	} // namespace mlir

	using namespace mlir;

	/// Create an integer or index constant.
	static Value createConst(Location loc, Type type, int value,
	PatternRewriter &rewriter) {
	auto attr = rewriter.getIntegerAttr(getElementTypeOrSelf(type), value);
	if (auto shapedTy = dyn_cast<ShapedType>(type)) {
	return rewriter.create<arith::ConstantOp>(
	loc, DenseElementsAttr::get(shapedTy, attr));
	}

	return rewriter.create<arith::ConstantOp>(loc, attr);
	}

	namespace {

	/// Expands CeilDivUIOp (n, m) into
	/// n == 0 ? 0 : ((n-1) / m) + 1
	struct CeilDivUIOpConverter : public OpRewritePattern<arith::CeilDivUIOp> {
	using OpRewritePattern::OpRewritePattern;
	LogicalResult matchAndRewrite(arith::CeilDivUIOp op,
	PatternRewriter &rewriter) const final {
	Location loc = op.getLoc();
	Value a = op.getLhs();
	Value b = op.getRhs();
	Value zero = createConst(loc, a.getType(), 0, rewriter);
	Value compare =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, a, zero);
	Value one = createConst(loc, a.getType(), 1, rewriter);
	Value minusOne = rewriter.create<arith::SubIOp>(loc, a, one);
	Value quotient = rewriter.create<arith::DivUIOp>(loc, minusOne, b);
	Value plusOne = rewriter.create<arith::AddIOp>(loc, quotient, one);
	rewriter.replaceOpWithNewOp<arith::SelectOp>(op, compare, zero, plusOne);
	return success();
	}
	};

	/// Expands CeilDivSIOp (n, m) into
	/// 1) x = (m > 0) ? -1 : 1
	/// 2) (n*m>0) ? ((n+x) / m) + 1 : - (-n / m)
	struct CeilDivSIOpConverter : public OpRewritePattern<arith::CeilDivSIOp> {
	using OpRewritePattern::OpRewritePattern;
	LogicalResult matchAndRewrite(arith::CeilDivSIOp op,
	PatternRewriter &rewriter) const final {
	Location loc = op.getLoc();
	Type type = op.getType();
	Value a = op.getLhs();
	Value b = op.getRhs();
	Value plusOne = createConst(loc, type, 1, rewriter);
	Value zero = createConst(loc, type, 0, rewriter);
	Value minusOne = createConst(loc, type, -1, rewriter);
	// Compute x = (b>0) ? -1 : 1.
	Value compare =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, b, zero);
	Value x = rewriter.create<arith::SelectOp>(loc, compare, minusOne, plusOne);
	// Compute positive res: 1 + ((x+a)/b).
	Value xPlusA = rewriter.create<arith::AddIOp>(loc, x, a);
	Value xPlusADivB = rewriter.create<arith::DivSIOp>(loc, xPlusA, b);
	Value posRes = rewriter.create<arith::AddIOp>(loc, plusOne, xPlusADivB);
	// Compute negative res: - ((-a)/b).
	Value minusA = rewriter.create<arith::SubIOp>(loc, zero, a);
	Value minusADivB = rewriter.create<arith::DivSIOp>(loc, minusA, b);
	Value negRes = rewriter.create<arith::SubIOp>(loc, zero, minusADivB);
	// Result is (a*b>0) ? pos result : neg result.
	// Note, we want to avoid using a*b because of possible overflow.
	// The case that matters are a>0, a==0, a<0, b>0 and b<0. We do
	// not particuliarly care if a*b<0 is true or false when b is zero
	// as this will result in an illegal divide. So `a*b<0` can be reformulated
	// as `(a<0 && b<0) \|\| (a>0 && b>0)' or `(a<0 && b<0) \|\| (a>0 && b>=0)'.
	// We pick the first expression here.
	Value aNeg =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, a, zero);
	Value aPos =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, a, zero);
	Value bNeg =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, b, zero);
	Value bPos =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, b, zero);
	Value firstTerm = rewriter.create<arith::AndIOp>(loc, aNeg, bNeg);
	Value secondTerm = rewriter.create<arith::AndIOp>(loc, aPos, bPos);
	Value compareRes =
	rewriter.create<arith::OrIOp>(loc, firstTerm, secondTerm);
	// Perform substitution and return success.
	rewriter.replaceOpWithNewOp<arith::SelectOp>(op, compareRes, posRes,
	negRes);
	return success();
	}
	};

	/// Expands FloorDivSIOp (n, m) into
	/// 1) x = (m<0) ? 1 : -1
	/// 2) return (n*m<0) ? - ((-n+x) / m) -1 : n / m
	struct FloorDivSIOpConverter : public OpRewritePattern<arith::FloorDivSIOp> {
	using OpRewritePattern::OpRewritePattern;
	LogicalResult matchAndRewrite(arith::FloorDivSIOp op,
	PatternRewriter &rewriter) const final {
	Location loc = op.getLoc();
	Type type = op.getType();
	Value a = op.getLhs();
	Value b = op.getRhs();
	Value plusOne = createConst(loc, type, 1, rewriter);
	Value zero = createConst(loc, type, 0, rewriter);
	Value minusOne = createConst(loc, type, -1, rewriter);
	// Compute x = (b<0) ? 1 : -1.
	Value compare =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, b, zero);
	Value x = rewriter.create<arith::SelectOp>(loc, compare, plusOne, minusOne);
	// Compute negative res: -1 - ((x-a)/b).
	Value xMinusA = rewriter.create<arith::SubIOp>(loc, x, a);
	Value xMinusADivB = rewriter.create<arith::DivSIOp>(loc, xMinusA, b);
	Value negRes = rewriter.create<arith::SubIOp>(loc, minusOne, xMinusADivB);
	// Compute positive res: a/b.
	Value posRes = rewriter.create<arith::DivSIOp>(loc, a, b);
	// Result is (a*b<0) ? negative result : positive result.
	// Note, we want to avoid using a*b because of possible overflow.
	// The case that matters are a>0, a==0, a<0, b>0 and b<0. We do
	// not particuliarly care if a*b<0 is true or false when b is zero
	// as this will result in an illegal divide. So `a*b<0` can be reformulated
	// as `(a>0 && b<0) \|\| (a>0 && b<0)' or `(a>0 && b<0) \|\| (a>0 && b<=0)'.
	// We pick the first expression here.
	Value aNeg =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, a, zero);
	Value aPos =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, a, zero);
	Value bNeg =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, b, zero);
	Value bPos =
	rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, b, zero);
	Value firstTerm = rewriter.create<arith::AndIOp>(loc, aNeg, bPos);
	Value secondTerm = rewriter.create<arith::AndIOp>(loc, aPos, bNeg);
	Value compareRes =
	rewriter.create<arith::OrIOp>(loc, firstTerm, secondTerm);
	// Perform substitution and return success.
	rewriter.replaceOpWithNewOp<arith::SelectOp>(op, compareRes, negRes,
	posRes);
	return success();
	}
	};

	template <typename OpTy, arith::CmpFPredicate pred>
	struct MaximumMinimumFOpConverter : public OpRewritePattern<OpTy> {
	public:
	using OpRewritePattern<OpTy>::OpRewritePattern;

	LogicalResult matchAndRewrite(OpTy op,
	PatternRewriter &rewriter) const final {
	Value lhs = op.getLhs();
	Value rhs = op.getRhs();

	Location loc = op.getLoc();
	// If any operand is NaN, 'cmp' will be true (and 'select' returns 'lhs').
	static_assert(pred == arith::CmpFPredicate::UGT \|\|
	pred == arith::CmpFPredicate::ULT,
	"pred must be either UGT or ULT");
	Value cmp = rewriter.create<arith::CmpFOp>(loc, pred, lhs, rhs);
	Value select = rewriter.create<arith::SelectOp>(loc, cmp, lhs, rhs);

	// Handle the case where rhs is NaN: 'isNaN(rhs) ? rhs : select'.
	Value isNaN = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
	rhs, rhs);
	rewriter.replaceOpWithNewOp<arith::SelectOp>(op, isNaN, rhs, select);
	return success();
	}
	};

	template <typename OpTy, arith::CmpFPredicate pred>
	struct MaxNumMinNumFOpConverter : public OpRewritePattern<OpTy> {
	public:
	using OpRewritePattern<OpTy>::OpRewritePattern;

	LogicalResult matchAndRewrite(OpTy op,
	PatternRewriter &rewriter) const final {
	Value lhs = op.getLhs();
	Value rhs = op.getRhs();

	Location loc = op.getLoc();
	// If any operand is NaN, 'cmp' will be true (and 'select' returns 'lhs').
	static_assert(pred == arith::CmpFPredicate::UGT \|\|
	pred == arith::CmpFPredicate::ULT,
	"pred must be either UGT or ULT");
	Value cmp = rewriter.create<arith::CmpFOp>(loc, pred, lhs, rhs);
	Value select = rewriter.create<arith::SelectOp>(loc, cmp, lhs, rhs);

	// Handle the case where lhs is NaN: 'isNaN(lhs) ? rhs : select'.
	Value isNaN = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
	lhs, lhs);
	rewriter.replaceOpWithNewOp<arith::SelectOp>(op, isNaN, rhs, select);
	return success();
	}
	};

	struct BFloat16ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
	using OpRewritePattern::OpRewritePattern;
	LogicalResult matchAndRewrite(arith::ExtFOp op,
	PatternRewriter &rewriter) const final {
	ImplicitLocOpBuilder b(op.getLoc(), rewriter);
	auto operand = op.getOperand();
	Type operandTy = operand.getType();
	Type resultTy = op.getType();
	Type operandETy = getElementTypeOrSelf(operandTy);
	Type resultETy = getElementTypeOrSelf(resultTy);

	if (!operandETy.isBF16() \|\| !resultETy.isF32()) {
	return rewriter.notifyMatchFailure(op, "not a ext of bf16 to f32.");
	}

	Type i16Ty = b.getI16Type();
	Type i32Ty = b.getI32Type();
	if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
	i16Ty = shapedTy.clone(i16Ty);
	i32Ty = shapedTy.clone(i32Ty);
	}

	Value bitcast = b.create<arith::BitcastOp>(i16Ty, operand);
	Value exti = b.create<arith::ExtUIOp>(i32Ty, bitcast);

	Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
	Value shl = b.create<arith::ShLIOp>(exti, c16);
	Value result = b.create<arith::BitcastOp>(resultTy, shl);

	rewriter.replaceOp(op, result);
	return success();
	}
	};

	struct BFloat16TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
	using OpRewritePattern::OpRewritePattern;
	LogicalResult matchAndRewrite(arith::TruncFOp op,
	PatternRewriter &rewriter) const final {
	ImplicitLocOpBuilder b(op.getLoc(), rewriter);
	auto operand = op.getOperand();
	Type operandTy = operand.getType();
	Type resultTy = op.getType();
	Type operandETy = getElementTypeOrSelf(operandTy);
	Type resultETy = getElementTypeOrSelf(resultTy);

	if (!operandETy.isF32() \|\| !resultETy.isBF16()) {
	return rewriter.notifyMatchFailure(op, "not a trunc of f32 to bf16.");
	}

	Type i1Ty = b.getI1Type();
	Type i16Ty = b.getI16Type();
	Type i32Ty = b.getI32Type();
	Type f32Ty = b.getF32Type();
	if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
	i1Ty = shapedTy.clone(i1Ty);
	i16Ty = shapedTy.clone(i16Ty);
	i32Ty = shapedTy.clone(i32Ty);
	f32Ty = shapedTy.clone(f32Ty);
	}

	Value bitcast = b.create<arith::BitcastOp>(i32Ty, operand);

	Value c23 = createConst(op.getLoc(), i32Ty, 23, rewriter);
	Value c31 = createConst(op.getLoc(), i32Ty, 31, rewriter);
	Value c23Mask = createConst(op.getLoc(), i32Ty, (1 << 23) - 1, rewriter);
	Value expMask =
	createConst(op.getLoc(), i32Ty, ((1 << 8) - 1) << 23, rewriter);
	Value expMax =
	createConst(op.getLoc(), i32Ty, ((1 << 8) - 2) << 23, rewriter);

	// Grab the sign bit.
	Value sign = b.create<arith::ShRUIOp>(bitcast, c31);

	// Our mantissa rounding value depends on the sign bit and the last
	// truncated bit.
	Value cManRound = createConst(op.getLoc(), i32Ty, (1 << 15), rewriter);
	cManRound = b.create<arith::SubIOp>(cManRound, sign);

	// Grab out the mantissa and directly apply rounding.
	Value man = b.create<arith::AndIOp>(bitcast, c23Mask);
	Value manRound = b.create<arith::AddIOp>(man, cManRound);

	// Grab the overflow bit and shift right if we overflow.
	Value roundBit = b.create<arith::ShRUIOp>(manRound, c23);
	Value manNew = b.create<arith::ShRUIOp>(manRound, roundBit);

	// Grab the exponent and round using the mantissa's carry bit.
	Value exp = b.create<arith::AndIOp>(bitcast, expMask);
	Value expCarry = b.create<arith::AddIOp>(exp, manRound);
	expCarry = b.create<arith::AndIOp>(expCarry, expMask);

	// If the exponent is saturated, we keep the max value.
	Value expCmp =
	b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, exp, expMax);
	exp = b.create<arith::SelectOp>(expCmp, exp, expCarry);

	// If the exponent is max and we rolled over, keep the old mantissa.
	Value roundBitBool = b.create<arith::TruncIOp>(i1Ty, roundBit);
	Value keepOldMan = b.create<arith::AndIOp>(expCmp, roundBitBool);
	man = b.create<arith::SelectOp>(keepOldMan, man, manNew);

	// Assemble the now rounded f32 value (as an i32).
	Value rounded = b.create<arith::ShLIOp>(sign, c31);
	rounded = b.create<arith::OrIOp>(rounded, exp);
	rounded = b.create<arith::OrIOp>(rounded, man);

	Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
	Value shr = b.create<arith::ShRUIOp>(rounded, c16);
	Value trunc = b.create<arith::TruncIOp>(i16Ty, shr);
	Value result = b.create<arith::BitcastOp>(resultTy, trunc);

	rewriter.replaceOp(op, result);
	return success();
	}
	};

	struct ArithExpandOpsPass
	: public arith::impl::ArithExpandOpsPassBase<ArithExpandOpsPass> {
	using ArithExpandOpsPassBase::ArithExpandOpsPassBase;

	void runOnOperation() override {
	RewritePatternSet patterns(&getContext());
	ConversionTarget target(getContext());

	arith::populateArithExpandOpsPatterns(patterns);

	target.addLegalDialect<arith::ArithDialect>();
	// clang-format off
	target.addIllegalOp<
	arith::CeilDivSIOp,
	arith::CeilDivUIOp,
	arith::FloorDivSIOp,
	arith::MaximumFOp,
	arith::MinimumFOp,
	arith::MaxNumFOp,
	arith::MinNumFOp
	>();

	if (includeBf16) {
	arith::populateExpandBFloat16Patterns(patterns);
	target.addDynamicallyLegalOp<arith::ExtFOp>(
	[](arith::ExtFOp op) {
	Type inETy = getElementTypeOrSelf(op.getOperand().getType());
	Type outETy = getElementTypeOrSelf(op.getType());
	return !(inETy.isBF16() && outETy.isF32());
	});

	target.addDynamicallyLegalOp<arith::TruncFOp>(
	[](arith::TruncFOp op) {
	Type inETy = getElementTypeOrSelf(op.getOperand().getType());
	Type outETy = getElementTypeOrSelf(op.getType());
	return !(inETy.isF32() && outETy.isBF16());
	});
	}

	// clang-format on
	if (failed(applyPartialConversion(getOperation(), target,
	std::move(patterns))))
	signalPassFailure();
	}
	};

	} // namespace

	void mlir::arith::populateCeilFloorDivExpandOpsPatterns(
	RewritePatternSet &patterns) {
	patterns
	.add<CeilDivSIOpConverter, CeilDivUIOpConverter, FloorDivSIOpConverter>(
	patterns.getContext());
	}

	void mlir::arith::populateExpandBFloat16Patterns(RewritePatternSet &patterns) {
	patterns.add<BFloat16ExtFOpConverter, BFloat16TruncFOpConverter>(
	patterns.getContext());
	}

	void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
	populateCeilFloorDivExpandOpsPatterns(patterns);
	// clang-format off
	patterns.add<
	MaximumMinimumFOpConverter<MaximumFOp, arith::CmpFPredicate::UGT>,
	MaximumMinimumFOpConverter<MinimumFOp, arith::CmpFPredicate::ULT>,
	MaxNumMinNumFOpConverter<MaxNumFOp, arith::CmpFPredicate::UGT>,
	MaxNumMinNumFOpConverter<MinNumFOp, arith::CmpFPredicate::ULT>
	>(patterns.getContext());
	// clang-format on
	}