mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp - llvm-project - Git at Google

 //===- EmulateWideInt.cpp - Wide integer operation emulation ----*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Arith/Transforms/Passes.h"

 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/WideIntEmulationConverter.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>

 namespace mlir::arith {
 #define GEN_PASS_DEF_ARITHEMULATEWIDEINT
 #include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
 } // namespace mlir::arith

 using namespace mlir;

 //===----------------------------------------------------------------------===//
 // Common Helper Functions
 //===----------------------------------------------------------------------===//

 /// Returns N bottom and N top bits from `value`, where N = `newBitWidth`.
 /// Treats `value` as a 2*N bits-wide integer.
 /// The bottom bits are returned in the first pair element, while the top bits
 /// in the second one.
 static std::pair<APInt, APInt> getHalves(const APInt &value,
                                          unsigned newBitWidth) {
   APInt low = value.extractBits(newBitWidth, 0);
   APInt high = value.extractBits(newBitWidth, newBitWidth);
   return {std::move(low), std::move(high)};
 }

 /// Computes the type of a single slice taken along the last (innermost)
 /// dimension of `type`. 1D vectors are scalarized to their element type, which
 /// mirrors how vector values are extracted/inserted in this file; for all
 /// other ranks the innermost dimension becomes x1, e.g.:
 ///   - vector<3x2xi16> --> vector<3x1xi16>
 ///   - vector<2xi16>   --> i16
 static Type reduceInnermostDim(VectorType type) {
   Type elementTy = type.getElementType();
   auto origShape = type.getShape();

   // A rank-1 vector collapses all the way down to a scalar.
   if (origShape.size() == 1)
     return elementTy;

   // Otherwise keep every outer dimension and shrink the last one to 1.
   auto reducedShape = to_vector(origShape);
   reducedShape.back() = 1;
   return VectorType::get(reducedShape, elementTy);
 }

 /// Extracts from the `input` vector the slice located at position `lastOffset`
 /// of the last dimension. The produced value either has its last dimension
 /// reduced to x1 or, for 1D inputs, is a scalar, e.g.:
 ///   - vector<3x2xi16> --> vector<3x1xi16>
 ///   - vector<2xi16>   --> i16
 /// `input` must be of vector type and `lastOffset` must be within the bounds
 /// of the last dimension (checked by an assertion only).
 static Value extractLastDimSlice(ConversionPatternRewriter &rewriter,
                                  Location loc, Value input,
                                  int64_t lastOffset) {
   auto inputTy = cast<VectorType>(input.getType());
   ArrayRef<int64_t> inputShape = inputTy.getShape();
   assert(lastOffset < inputShape.back() && "Offset out of bounds");

   if (inputShape.size() != 1) {
     // Multi-dimensional input: take a strided slice that spans every outer
     // dimension fully and exactly one element of the last dimension.
     SmallVector<int64_t> sliceOffsets(inputShape.size(), 0);
     sliceOffsets.back() = lastOffset;
     auto sliceSizes = llvm::to_vector(inputShape);
     sliceSizes.back() = 1;
     SmallVector<int64_t> sliceStrides(inputShape.size(), 1);

     return rewriter.create<vector::ExtractStridedSliceOp>(
         loc, input, sliceOffsets, sliceSizes, sliceStrides);
   }

   // 1D input: extract the element directly, which scalarizes the result.
   return rewriter.create<vector::ExtractOp>(loc, input, lastOffset);
 }

 /// Splits `input`, whose type is `vector<...x2T>`, into its two last-dimension
 /// slices. The slice at offset 0 is returned as the first pair element and the
 /// slice at offset 1 as the second one.
 static std::pair<Value, Value>
 extractLastDimHalves(ConversionPatternRewriter &rewriter, Location loc,
                      Value input) {
   // Create the offset-0 extraction before the offset-1 one.
   Value firstSlice = extractLastDimSlice(rewriter, loc, input, 0);
   Value secondSlice = extractLastDimSlice(rewriter, loc, input, 1);
   return {firstSlice, secondSlice};
 }

 /// Performs a vector shape cast to drop the trailing x1 dimension, e.g.:
 ///   - vector<3x1xi16> --> vector<3xi16>
 /// If the `input` is not of vector type (i.e., it is a scalar), this is a noop
 /// and `input` is returned unchanged.
 /// Vector inputs are expected to have at least two dimensions, the last of
 /// which is x1; both expectations are checked by assertions only.
 static Value dropTrailingX1Dim(ConversionPatternRewriter &rewriter,
                                Location loc, Value input) {
   auto vecTy = dyn_cast<VectorType>(input.getType());
   if (!vecTy)
     return input;

   // Shape cast to drop the last x1 dimension.
   ArrayRef<int64_t> shape = vecTy.getShape();
   assert(shape.size() >= 2 && "Expected vector with at least two dims");
   assert(shape.back() == 1 && "Expected the last vector dim to be x1");

   // The new type keeps the element type and all but the last dimension.
   auto newVecTy = VectorType::get(shape.drop_back(), vecTy.getElementType());
   return rewriter.create<vector::ShapeCastOp>(loc, newVecTy, input);
 }

 /// Shape-casts a vector `input` to the same type with one extra trailing x1
 /// dimension. Non-vector (scalar) inputs are returned untouched.
 static Value appendX1Dim(ConversionPatternRewriter &rewriter, Location loc,
                          Value input) {
   if (auto vectorTy = dyn_cast<VectorType>(input.getType())) {
     // Same dimensions as before, followed by a unit dimension.
     auto extendedShape = llvm::to_vector(vectorTy.getShape());
     extendedShape.push_back(1);
     auto extendedTy =
         VectorType::get(extendedShape, vectorTy.getElementType());
     return rewriter.create<vector::ShapeCastOp>(loc, extendedTy, input);
   }

   // Scalars have no shape to extend.
   return input;
 }

 /// Writes the `source` slice into the `dest` vector at position `lastOffset`
 /// of the last dimension and returns the updated vector. When `dest` is a 1D
 /// vector, `source` can be a scalar. `dest` must be of vector type and
 /// `lastOffset` must be within the bounds of its last dimension (checked by an
 /// assertion only).
 static Value insertLastDimSlice(ConversionPatternRewriter &rewriter,
                                 Location loc, Value source, Value dest,
                                 int64_t lastOffset) {
   auto destTy = cast<VectorType>(dest.getType());
   ArrayRef<int64_t> destShape = destTy.getShape();
   assert(lastOffset < destShape.back() && "Offset out of bounds");

   if (!isa<IntegerType>(source.getType())) {
     // Vector source: insert it as a unit-stride slice that starts at offset 0
     // in every dimension except the last one.
     SmallVector<int64_t> sliceOffsets(destShape.size(), 0);
     sliceOffsets.back() = lastOffset;
     SmallVector<int64_t> sliceStrides(destShape.size(), 1);
     return rewriter.create<vector::InsertStridedSliceOp>(
         loc, source, dest, sliceOffsets, sliceStrides);
   }

   // Integer scalar source: a plain element insertion suffices.
   return rewriter.create<vector::InsertOp>(loc, source, dest, lastOffset);
 }

 /// Builds a vector of type `resultType` by starting from a zero constant of
 /// that type and inserting each of the `resultComponents` at consecutive
 /// offsets (0, 1, ...) of the last vector dimension.
 /// When all `resultComponents` are scalars, the result type is `vector<NxT>`;
 /// when `resultComponents` are `vector<...x1xT>`s, the result type is
 /// `vector<...xNxT>`, where `N` is the number of `resultComponents`.
 static Value constructResultVector(ConversionPatternRewriter &rewriter,
                                    Location loc, VectorType resultType,
                                    ValueRange resultComponents) {
   llvm::ArrayRef<int64_t> shape = resultType.getShape();
   // The shape is only inspected by the assertions below.
   (void)shape;
   assert(!shape.empty() && "Result expected to have dimensions");
   assert(shape.back() == static_cast<int64_t>(resultComponents.size()) &&
          "Wrong number of result components");

   Value accumulated =
       createScalarOrSplatConstant(rewriter, loc, resultType, 0);
   int64_t nextOffset = 0;
   for (Value component : resultComponents) {
     accumulated = insertLastDimSlice(rewriter, loc, component, accumulated,
                                      nextOffset);
     ++nextOffset;
   }

   return accumulated;
 }

 namespace {
 //===----------------------------------------------------------------------===//
 // ConvertConstant
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.constant` ops whose type converts to a vector type. Every
 /// original integer is split with `getHalves` and emitted as a (low, high)
 /// pair of narrow integers in the new dense constant. Integer, splat, and
 /// general dense element attributes are supported; any other attribute kind
 /// results in a match failure.
 struct ConvertConstant final : OpConversionPattern<arith::ConstantOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::ConstantOp op, OpAdaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto emulatedTy =
         getTypeConverter()->convertType<VectorType>(op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           op, llvm::formatv("unsupported type: {0}", op.getType()));

     unsigned halfBitWidth = emulatedTy.getElementTypeBitWidth();
     Attribute origAttr = op.getValueAttr();

     // Case 1: a single integer becomes exactly one (low, high) pair.
     if (auto scalarAttr = dyn_cast<IntegerAttr>(origAttr)) {
       auto [low, high] = getHalves(scalarAttr.getValue(), halfBitWidth);
       auto halvesAttr = DenseElementsAttr::get(emulatedTy, {low, high});
       rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, halvesAttr);
       return success();
     }

     // Case 2: a splat. The halves are computed once and the same pair is
     // repeated for every element of the original constant.
     if (auto splat = dyn_cast<SplatElementsAttr>(origAttr)) {
       auto [low, high] =
           getHalves(splat.getSplatValue<APInt>(), halfBitWidth);
       int64_t splatCount = splat.getNumElements();
       SmallVector<APInt> interleaved;
       interleaved.reserve(splatCount * 2);
       for (int64_t idx = 0; idx < splatCount; ++idx) {
         interleaved.push_back(low);
         interleaved.push_back(high);
       }

       auto halvesAttr = DenseElementsAttr::get(emulatedTy, interleaved);
       rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, halvesAttr);
       return success();
     }

     // Case 3: general dense elements. Each element is split individually.
     if (auto dense = dyn_cast<DenseElementsAttr>(origAttr)) {
       int64_t elemCount = dense.getNumElements();
       SmallVector<APInt> interleaved;
       interleaved.reserve(elemCount * 2);
       for (const APInt &wideVal : dense.getValues<APInt>()) {
         auto halves = getHalves(wideVal, halfBitWidth);
         interleaved.push_back(std::move(halves.first));
         interleaved.push_back(std::move(halves.second));
       }

       auto halvesAttr = DenseElementsAttr::get(emulatedTy, interleaved);
       rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, halvesAttr);
       return success();
     }

     return rewriter.notifyMatchFailure(op.getLoc(),
                                        "unhandled constant attribute");
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertAddI
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.addi` on types that convert to a vector type. The low
 /// halves are summed with `arith.addui_extended`; its overflow result is
 /// zero-extended and added together with both high halves to produce the high
 /// half of the result.
 struct ConvertAddI final : OpConversionPattern<arith::AddIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::AddIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto emulatedTy =
         getTypeConverter()->convertType<VectorType>(op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     // Type of a single (low or high) component of the emulated value.
     Type halfTy = reduceInnermostDim(emulatedTy);

     auto lhsHalves = extractLastDimHalves(rewriter, loc, adaptor.getLhs());
     auto rhsHalves = extractLastDimHalves(rewriter, loc, adaptor.getRhs());

     // Low half: an addition that also reports whether it overflowed.
     auto lowAdd = rewriter.create<arith::AddUIExtendedOp>(
         loc, lhsHalves.first, rhsHalves.first);
     Value carry =
         rewriter.create<arith::ExtUIOp>(loc, halfTy, lowAdd.getOverflow());

     // High half: carry + lhs.high + rhs.high.
     Value carryPlusLhs =
         rewriter.create<arith::AddIOp>(loc, carry, lhsHalves.second);
     Value highSum =
         rewriter.create<arith::AddIOp>(loc, carryPlusLhs, rhsHalves.second);

     Value emulatedResult = constructResultVector(
         rewriter, loc, emulatedTy, {lowAdd.getSum(), highSum});
     rewriter.replaceOp(op, emulatedResult);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertBitwiseBinary
 //===----------------------------------------------------------------------===//

 /// Conversion pattern template for bitwise binary ops, e.g., `arith.andi`.
 /// The same `BinaryOp` is applied independently to the two low halves and to
 /// the two high halves of the operands, and the results are recombined.
 template <typename BinaryOp>
 struct ConvertBitwiseBinary final : OpConversionPattern<BinaryOp> {
   using OpConversionPattern<BinaryOp>::OpConversionPattern;
   using OpAdaptor = typename OpConversionPattern<BinaryOp>::OpAdaptor;

   LogicalResult
   matchAndRewrite(BinaryOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto emulatedTy =
         this->getTypeConverter()->template convertType<VectorType>(
             op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     auto lhsHalves = extractLastDimHalves(rewriter, loc, adaptor.getLhs());
     auto rhsHalves = extractLastDimHalves(rewriter, loc, adaptor.getRhs());

     // No bits cross between halves, so each half is computed on its own.
     Value lowRes =
         rewriter.create<BinaryOp>(loc, lhsHalves.first, rhsHalves.first);
     Value highRes =
         rewriter.create<BinaryOp>(loc, lhsHalves.second, rhsHalves.second);

     Value emulatedResult =
         constructResultVector(rewriter, loc, emulatedTy, {lowRes, highRes});
     rewriter.replaceOp(op, emulatedResult);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertCmpI
 //===----------------------------------------------------------------------===//

 /// Maps each signed ordering predicate (`sge`, `sgt`, `sle`, `slt`) to its
 /// unsigned counterpart (`uge`, `ugt`, `ule`, `ult`). Every other predicate is
 /// returned unchanged.
 static arith::CmpIPredicate toUnsignedPredicate(arith::CmpIPredicate pred) {
   using P = arith::CmpIPredicate;
   if (pred == P::sge)
     return P::uge;
   if (pred == P::sgt)
     return P::ugt;
   if (pred == P::sle)
     return P::ule;
   if (pred == P::slt)
     return P::ult;

   // Not a signed ordering predicate: pass it through as is.
   return pred;
 }

 /// Emulates `arith.cmpi` whose operand type converts to a vector type.
 /// The low halves are compared with the unsigned version of the predicate and
 /// the high halves with the original predicate. The two partial results are
 /// then combined:
 ///   - `eq`: both halves must compare equal (`arith.andi`);
 ///   - `ne`: either half may differ (`arith.ori`);
 ///   - otherwise: if the high halves are equal the low comparison decides,
 ///     else the high comparison decides (`arith.select`).
 /// Fails to match when the operand type cannot be converted.
 struct ConvertCmpI final : OpConversionPattern<arith::CmpIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::CmpIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     // Legality is decided by the operand type, so that is also the type
     // reported in the diagnostic (not the result type of the comparison).
     Type origInputTy = op.getLhs().getType();
     auto inputTy = getTypeConverter()->convertType<VectorType>(origInputTy);
     if (!inputTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", origInputTy));

     arith::CmpIPredicate highPred = adaptor.getPredicate();
     arith::CmpIPredicate lowPred = toUnsignedPredicate(highPred);

     auto [lhsElem0, lhsElem1] =
         extractLastDimHalves(rewriter, loc, adaptor.getLhs());
     auto [rhsElem0, rhsElem1] =
         extractLastDimHalves(rewriter, loc, adaptor.getRhs());

     Value lowCmp =
         rewriter.create<arith::CmpIOp>(loc, lowPred, lhsElem0, rhsElem0);
     Value highCmp =
         rewriter.create<arith::CmpIOp>(loc, highPred, lhsElem1, rhsElem1);

     Value cmpResult{};
     switch (highPred) {
     case arith::CmpIPredicate::eq: {
       cmpResult = rewriter.create<arith::AndIOp>(loc, lowCmp, highCmp);
       break;
     }
     case arith::CmpIPredicate::ne: {
       cmpResult = rewriter.create<arith::OrIOp>(loc, lowCmp, highCmp);
       break;
     }
     default: {
       // Handle inequality checks.
       Value highEq = rewriter.create<arith::CmpIOp>(
           loc, arith::CmpIPredicate::eq, lhsElem1, rhsElem1);
       cmpResult =
           rewriter.create<arith::SelectOp>(loc, highEq, lowCmp, highCmp);
       break;
     }
     }

     assert(cmpResult && "Unhandled case");
     // Vector comparisons carry a trailing x1 dimension inherited from the
     // extracted halves; drop it to produce the final result.
     rewriter.replaceOp(op, dropTrailingX1Dim(rewriter, loc, cmpResult));
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertMulI
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.muli` on types that convert to a vector type, using long
 /// multiplication over the low and high halves of the operands.
 struct ConvertMulI final : OpConversionPattern<arith::MulIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::MulIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto emulatedTy =
         getTypeConverter()->convertType<VectorType>(op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     auto lhsHalves = extractLastDimHalves(rewriter, loc, adaptor.getLhs());
     auto rhsHalves = extractLastDimHalves(rewriter, loc, adaptor.getRhs());

     // Standard (long) multiplication. The product of two i2N integers can
     // need up to 4N bits, but only the bottom 2N bits are kept, so the
     // high-by-high partial product is never computed.
     auto lowByLow = rewriter.create<arith::MulUIExtendedOp>(
         loc, lhsHalves.first, rhsHalves.first);
     Value lowByHigh =
         rewriter.create<arith::MulIOp>(loc, lhsHalves.first, rhsHalves.second);
     Value highByLow =
         rewriter.create<arith::MulIOp>(loc, lhsHalves.second, rhsHalves.first);

     // The high half accumulates the upper bits of low*low and both cross
     // products.
     Value lowHalf = lowByLow.getLow();
     Value partialHigh =
         rewriter.create<arith::AddIOp>(loc, lowByLow.getHigh(), lowByHigh);
     Value highHalf =
         rewriter.create<arith::AddIOp>(loc, partialHigh, highByLow);

     Value emulatedResult =
         constructResultVector(rewriter, loc, emulatedTy, {lowHalf, highHalf});
     rewriter.replaceOp(op, emulatedResult);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertExtSI
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.extsi` whose result type converts to a vector type. The
 /// low half is the operand sign-extended to the component type; the high half
 /// is obtained by sign-extending the outcome of a "low half < 0" comparison.
 struct ConvertExtSI final : OpConversionPattern<arith::ExtSIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::ExtSIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto emulatedTy =
         getTypeConverter()->convertType<VectorType>(op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     Type halfTy = reduceInnermostDim(emulatedTy);

     // Low half: the input sign-extended (or folded away) to the component
     // type. Vector inputs first receive a trailing x1 dimension so that their
     // shape matches the component type.
     Value operand = appendX1Dim(rewriter, loc, adaptor.getIn());
     Value lowHalf =
         rewriter.createOrFold<arith::ExtSIOp>(loc, halfTy, operand);

     // High half: sign-extension of the `lowHalf < 0` comparison result.
     Value zero = createScalarOrSplatConstant(rewriter, loc, halfTy, 0);
     Value isNegative = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::slt, lowHalf, zero);
     Value highHalf =
         rewriter.create<arith::ExtSIOp>(loc, halfTy, isNegative);

     Value emulatedResult =
         constructResultVector(rewriter, loc, emulatedTy, {lowHalf, highHalf});
     rewriter.replaceOp(op, emulatedResult);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertExtUI
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.extui` whose result type converts to a vector type. The
 /// zero-extended operand is inserted at offset 0 of an all-zero constant of
 /// the emulated type, so the high half stays zero.
 struct ConvertExtUI final : OpConversionPattern<arith::ExtUIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::ExtUIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto emulatedTy =
         getTypeConverter()->convertType<VectorType>(op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     Type halfTy = reduceInnermostDim(emulatedTy);

     // Low half: the input zero-extended (or folded away) to the component
     // type. Vector inputs first receive a trailing x1 dimension.
     Value operand = appendX1Dim(rewriter, loc, adaptor.getIn());
     Value lowHalf =
         rewriter.createOrFold<arith::ExtUIOp>(loc, halfTy, operand);

     // Start from all zeros and only overwrite the low half.
     Value allZeros = createScalarOrSplatConstant(rewriter, loc, emulatedTy, 0);
     Value emulatedResult =
         insertLastDimSlice(rewriter, loc, lowHalf, allZeros, 0);
     rewriter.replaceOp(op, emulatedResult);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertMaxMin
 //===----------------------------------------------------------------------===//

 /// Conversion pattern template for integer max/min ops. `SourceOp` is
 /// replaced with an `arith.cmpi` using `CmpPred` followed by an
 /// `arith.select`, both built over the original operands of `op`.
 template <typename SourceOp, arith::CmpIPredicate CmpPred>
 struct ConvertMaxMin final : OpConversionPattern<SourceOp> {
   using OpConversionPattern<SourceOp>::OpConversionPattern;

   LogicalResult
   matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();

     // Only apply to types that the converter maps to a vector type.
     auto emulatedTy =
         this->getTypeConverter()->template convertType<VectorType>(
             op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     // Express Max*I/Min*I as "lhs <CmpPred> rhs ? lhs : rhs" on the original
     // operands. The CmpI and Select emulation patterns take care of the final
     // legalization.
     Value lhs = op.getLhs();
     Value rhs = op.getRhs();
     Value pickLhs = rewriter.create<arith::CmpIOp>(loc, CmpPred, lhs, rhs);
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, pickLhs, lhs, rhs);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // Convert IndexCast ops
 //===----------------------------------------------------------------------===//

 /// Returns true iff `type` is `index` or a vector whose element type is
 /// `index`, i.e., `vector<...index>`.
 static bool isIndexOrIndexVector(Type type) {
   // For vectors the decision is made on the element type; any other type is
   // inspected directly.
   Type scalarTy = type;
   if (auto vectorTy = dyn_cast<VectorType>(type))
     scalarTy = vectorTy.getElementType();

   return isa<IndexType>(scalarTy);
 }

 /// Conversion pattern template for index casts from an emulated integer type
 /// to `index` or `vector<...index>`. Only the low half of the input is kept
 /// and a new `CastOp` to the original result type is created on it. Ops whose
 /// result is not index-typed are not matched.
 template <typename CastOp>
 struct ConvertIndexCastIntToIndex final : OpConversionPattern<CastOp> {
   using OpConversionPattern<CastOp>::OpConversionPattern;

   LogicalResult
   matchAndRewrite(CastOp op, typename CastOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Type indexResultTy = op.getType();
     if (!isIndexOrIndexVector(indexResultTy))
       return failure();

     Location loc = op.getLoc();
     Type origInputTy = op.getIn().getType();
     auto emulatedInputTy =
         this->getTypeConverter()->template convertType<VectorType>(
             origInputTy);
     if (!emulatedInputTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", origInputTy));

     // Keep only the low half of the input, which truncates the original
     // value, and remove the trailing x1 dimension left by the extraction.
     Value lowHalf = extractLastDimSlice(rewriter, loc, adaptor.getIn(), 0);
     lowHalf = dropTrailingX1Dim(rewriter, loc, lowHalf);
     rewriter.replaceOpWithNewOp<CastOp>(op, indexResultTy, lowHalf);
     return success();
   }
 };

 /// Conversion pattern template for index casts from `index` or
 /// `vector<...index>` to an emulated integer type. The cast is first performed
 /// to the narrow integer type of the converter's maximum target bitwidth, and
 /// the result is then widened to the original result type with `ExtensionOp`.
 /// Ops whose input is not index-typed are not matched.
 template <typename CastOp, typename ExtensionOp>
 struct ConvertIndexCastIndexToInt final : OpConversionPattern<CastOp> {
   using OpConversionPattern<CastOp>::OpConversionPattern;

   LogicalResult
   matchAndRewrite(CastOp op, typename CastOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     if (!isIndexOrIndexVector(op.getIn().getType()))
       return failure();

     Location loc = op.getLoc();
     auto *typeConverter =
         this->template getTypeConverter<arith::WideIntEmulationConverter>();

     Type wideResultTy = op.getType();
     auto emulatedTy =
         typeConverter->template convertType<VectorType>(wideResultTy);
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", wideResultTy));

     // Pick the narrow type to cast to; for vector results it has the same
     // shape as the original result.
     Type narrowTy =
         rewriter.getIntegerType(typeConverter->getMaxTargetIntBitWidth());
     if (auto wideVecTy = dyn_cast<VectorType>(wideResultTy))
       narrowTy = VectorType::get(wideVecTy.getShape(), narrowTy);

     // Cast to the narrow type, then sign or zero-extend to the result type.
     // The matching conversion pattern legalizes the extension op.
     Value narrowVal = rewriter.create<CastOp>(loc, narrowTy, adaptor.getIn());
     rewriter.replaceOpWithNewOp<ExtensionOp>(op, wideResultTy, narrowVal);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertSelect
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.select` on types that convert to a vector type. The same
 /// condition is used to select between the low halves and between the high
 /// halves of the true/false values.
 struct ConvertSelect final : OpConversionPattern<arith::SelectOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::SelectOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto emulatedTy =
         getTypeConverter()->convertType<VectorType>(op.getType());
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     auto trueHalves =
         extractLastDimHalves(rewriter, loc, adaptor.getTrueValue());
     auto falseHalves =
         extractLastDimHalves(rewriter, loc, adaptor.getFalseValue());
     // A vector condition gets a trailing x1 dimension appended; a scalar
     // condition is used as is.
     Value condition = appendX1Dim(rewriter, loc, adaptor.getCondition());

     Value lowRes = rewriter.create<arith::SelectOp>(
         loc, condition, trueHalves.first, falseHalves.first);
     Value highRes = rewriter.create<arith::SelectOp>(
         loc, condition, trueHalves.second, falseHalves.second);

     Value emulatedResult =
         constructResultVector(rewriter, loc, emulatedTy, {lowRes, highRes});
     rewriter.replaceOp(op, emulatedResult);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertShLI
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.shli` on types that convert to a vector type. The result is
 /// assembled from narrow shifts of the LHS halves, with `arith.select` ops
 /// choosing between the "shift amount < element bitwidth" and
 /// "shift amount >= element bitwidth" variants. Only the low half of the RHS
 /// is used as the shift amount.
 struct ConvertShLI final : OpConversionPattern<arith::ShLIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::ShLIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();

     Type oldTy = op.getType();
     auto newTy = getTypeConverter()->convertType<VectorType>(oldTy);
     if (!newTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     // Type of a single (low or high) component of the emulated value.
     Type newOperandTy = reduceInnermostDim(newTy);
     // `oldBitWidth` == `2 * newBitWidth`
     unsigned newBitWidth = newTy.getElementTypeBitWidth();

     auto [lhsElem0, lhsElem1] =
         extractLastDimHalves(rewriter, loc, adaptor.getLhs());
     // Only the low half of the shift amount is extracted.
     Value rhsElem0 = extractLastDimSlice(rewriter, loc, adaptor.getRhs(), 0);

     // Assume that the shift amount is < 2 * newBitWidth. Calculate the low and
     // high halves of the results separately:
     //   1. low := LHS.low shli RHS
     //
     //   2. high := a or b or c, where:
     //     a) Bits from LHS.high, shifted by the RHS.
     //     b) Bits from LHS.low, shifted right. These come into play when
     //        RHS < newBitWidth, e.g.:
     //         [0000][llll] shli 3 --> [0lll][l000]
     //                                    ^
     //                                    |
     //                           [llll] shrui (4 - 3)
     //     c) Bits from LHS.low, shifted left. These matter when
     //        RHS >= newBitWidth, e.g.:
     //         [0000][llll] shli 7 --> [l000][0000]
     //                                   ^
     //                                   |
     //                          [llll] shli (7 - 4)
     //
     // Because shifts by values >= newBitWidth are undefined, we ignore the high
     // half of RHS, and introduce 'bounds checks' to account for
     // RHS.low >= newBitWidth.
     //
     // TODO: Explore possible optimizations.
     Value zeroCst = createScalarOrSplatConstant(rewriter, loc, newOperandTy, 0);
     Value elemBitWidth =
         createScalarOrSplatConstant(rewriter, loc, newOperandTy, newBitWidth);

     // True when the shift amount does not fit within a single narrow element.
     Value illegalElemShift = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::uge, rhsElem0, elemBitWidth);

     // Low half of the result: zero once everything is shifted out of it.
     Value shiftedElem0 =
         rewriter.create<arith::ShLIOp>(loc, lhsElem0, rhsElem0);
     Value resElem0 = rewriter.create<arith::SelectOp>(loc, illegalElemShift,
                                                       zeroCst, shiftedElem0);

     // Case b): bits of LHS.low that cross into the high half. The shift amount
     // is capped at `elemBitWidth` so that the subtraction below never wraps.
     // NOTE(review): when RHS.low is 0, `rightShiftAmount` equals
     // `elemBitWidth` and `shiftedRight` is the value picked for the high half
     // below -- confirm that a narrow shift by the full element width is
     // acceptable under the arith shift semantics.
     Value cappedShiftAmount = rewriter.create<arith::SelectOp>(
         loc, illegalElemShift, elemBitWidth, rhsElem0);
     Value rightShiftAmount =
         rewriter.create<arith::SubIOp>(loc, elemBitWidth, cappedShiftAmount);
     Value shiftedRight =
         rewriter.create<arith::ShRUIOp>(loc, lhsElem0, rightShiftAmount);
     // Case c): bits of LHS.low shifted by the amount exceeding `elemBitWidth`.
     // This value is only selected when `illegalElemShift` is true.
     Value overshotShiftAmount =
         rewriter.create<arith::SubIOp>(loc, rhsElem0, elemBitWidth);
     Value shiftedLeft =
         rewriter.create<arith::ShLIOp>(loc, lhsElem0, overshotShiftAmount);

     // Case a): bits of LHS.high, or zero once they are all shifted out.
     Value shiftedElem1 =
         rewriter.create<arith::ShLIOp>(loc, lhsElem1, rhsElem0);
     Value resElem1High = rewriter.create<arith::SelectOp>(
         loc, illegalElemShift, zeroCst, shiftedElem1);
     // Pick between cases c) and b), then merge with case a).
     Value resElem1Low = rewriter.create<arith::SelectOp>(
         loc, illegalElemShift, shiftedLeft, shiftedRight);
     Value resElem1 =
         rewriter.create<arith::OrIOp>(loc, resElem1Low, resElem1High);

     Value resultVec =
         constructResultVector(rewriter, loc, newTy, {resElem0, resElem1});
     rewriter.replaceOp(op, resultVec);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertShRUI
 //===----------------------------------------------------------------------===//

 /// Emulates `arith.shrui` on types that convert to a vector type. The result
 /// is assembled from narrow shifts of the LHS halves, with `arith.select` ops
 /// choosing between the "shift amount < element bitwidth" and
 /// "shift amount >= element bitwidth" variants. Only the low half of the RHS
 /// is used as the shift amount.
 struct ConvertShRUI final : OpConversionPattern<arith::ShRUIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::ShRUIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();

     Type oldTy = op.getType();
     auto newTy = getTypeConverter()->convertType<VectorType>(oldTy);
     if (!newTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     // Type of a single (low or high) component of the emulated value.
     Type newOperandTy = reduceInnermostDim(newTy);
     // `oldBitWidth` == `2 * newBitWidth`
     unsigned newBitWidth = newTy.getElementTypeBitWidth();

     auto [lhsElem0, lhsElem1] =
         extractLastDimHalves(rewriter, loc, adaptor.getLhs());
     // Only the low half of the shift amount is extracted.
     Value rhsElem0 = extractLastDimSlice(rewriter, loc, adaptor.getRhs(), 0);

     // Assume that the shift amount is < 2 * newBitWidth. Calculate the low and
     // high halves of the results separately:
     //   1. low := a or b or c, where:
     //     a) Bits from LHS.low, shifted by the RHS.
     //     b) Bits from LHS.high, shifted left. These matter when
     //        RHS < newBitWidth, e.g.:
     //         [hhhh][0000] shrui 3 --> [000h][hhh0]
     //                                          ^
     //                                          |
     //                                 [hhhh] shli (4 - 3)
     //     c) Bits from LHS.high, shifted right. These come into play when
     //        RHS >= newBitWidth, e.g.:
     //         [hhhh][0000] shrui 7 --> [0000][000h]
     //                                          ^
     //                                          |
     //                                 [hhhh] shrui (7 - 4)
     //
     //   2. high := LHS.high shrui RHS
     //
     // Because shifts by values >= newBitWidth are undefined, we ignore the high
     // half of RHS, and introduce 'bounds checks' to account for
     // RHS.low >= newBitWidth.
     //
     // TODO: Explore possible optimizations.
     Value zeroCst = createScalarOrSplatConstant(rewriter, loc, newOperandTy, 0);
     Value elemBitWidth =
         createScalarOrSplatConstant(rewriter, loc, newOperandTy, newBitWidth);

     // True when the shift amount does not fit within a single narrow element.
     Value illegalElemShift = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::uge, rhsElem0, elemBitWidth);

     // Case a): bits of LHS.low, or zero once they are all shifted out.
     Value shiftedElem0 =
         rewriter.create<arith::ShRUIOp>(loc, lhsElem0, rhsElem0);
     Value resElem0Low = rewriter.create<arith::SelectOp>(loc, illegalElemShift,
                                                          zeroCst, shiftedElem0);
     // High half of the result: zero once everything is shifted out of it.
     Value shiftedElem1 =
         rewriter.create<arith::ShRUIOp>(loc, lhsElem1, rhsElem0);
     Value resElem1 = rewriter.create<arith::SelectOp>(loc, illegalElemShift,
                                                       zeroCst, shiftedElem1);

     // Case b): bits of LHS.high that cross into the low half. The shift amount
     // is capped at `elemBitWidth` so that the subtraction below never wraps.
     // NOTE(review): when RHS.low is 0, `leftShiftAmount` equals `elemBitWidth`
     // and `shiftedLeft` is the value picked for the low half below -- confirm
     // that a narrow shift by the full element width is acceptable under the
     // arith shift semantics.
     Value cappedShiftAmount = rewriter.create<arith::SelectOp>(
         loc, illegalElemShift, elemBitWidth, rhsElem0);
     Value leftShiftAmount =
         rewriter.create<arith::SubIOp>(loc, elemBitWidth, cappedShiftAmount);
     Value shiftedLeft =
         rewriter.create<arith::ShLIOp>(loc, lhsElem1, leftShiftAmount);
     // Case c): bits of LHS.high shifted by the amount exceeding
     // `elemBitWidth`. This value is only selected when `illegalElemShift` is
     // true.
     Value overshotShiftAmount =
         rewriter.create<arith::SubIOp>(loc, rhsElem0, elemBitWidth);
     Value shiftedRight =
         rewriter.create<arith::ShRUIOp>(loc, lhsElem1, overshotShiftAmount);

     // Pick between cases c) and b), then merge with case a).
     Value resElem0High = rewriter.create<arith::SelectOp>(
         loc, illegalElemShift, shiftedRight, shiftedLeft);
     Value resElem0 =
         rewriter.create<arith::OrIOp>(loc, resElem0Low, resElem0High);

     Value resultVec =
         constructResultVector(rewriter, loc, newTy, {resElem0, resElem1});
     rewriter.replaceOp(op, resultVec);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertShRSI
 //===----------------------------------------------------------------------===//

 /// Rewrites a wide `arith.shrsi` as the bitwise-or of a wide `arith.shrui` and
 /// the sign-extension bits that an arithmetic shift would have shifted in.
 /// The wide ops created here are left for the other emulation patterns to
 /// legalize.
 struct ConvertShRSI final : OpConversionPattern<arith::ShRSIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::ShRSIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();

     Type wideTy = op.getType();
     auto emulatedTy = getTypeConverter()->convertType<VectorType>(wideTy);
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", op.getType()));

     // Only the high half of the LHS (it carries the sign) and the low half of
     // the RHS (the shift amount) are inspected directly.
     Value lhsHigh = extractLastDimSlice(rewriter, loc, adaptor.getLhs(), 1);
     Value shiftLow = extractLastDimSlice(rewriter, loc, adaptor.getRhs(), 0);

     Type halfTy = shiftLow.getType();
     int64_t wideBitWidth = emulatedTy.getElementTypeBitWidth() * 2;

     // Do as much work as possible on the narrow type: first find out whether
     // the input is negative by comparing its high half against zero.
     Value narrowZero = createScalarOrSplatConstant(rewriter, loc, halfTy, 0);
     Value highIsNegative = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::slt, lhsHigh, narrowZero);
     Value isNegative = dropTrailingX1Dim(rewriter, loc, highIsNegative);

     // Sign-extending the i1 predicate gives all ones or all zeros in the wide
     // type. Shifting that pattern left by (wide bitwidth - shift amount)
     // leaves exactly the bits that the right shift fills with the sign.
     Value signFill = rewriter.create<arith::ExtSIOp>(loc, wideTy, isNegative);
     Value wideWidthCst =
         createScalarOrSplatConstant(rewriter, loc, halfTy, wideBitWidth);
     Value keptBitsSlice =
         rewriter.create<arith::SubIOp>(loc, wideWidthCst, shiftLow);
     Value keptBitsNarrow = dropTrailingX1Dim(rewriter, loc, keptBitsSlice);
     Value keptBitsWide =
         rewriter.create<arith::ExtUIOp>(loc, wideTy, keptBitsNarrow);
     Value signExtBits =
         rewriter.create<arith::ShLIOp>(loc, signFill, keptBitsWide);

     // The logical right shift is built from the original (wide) operands.
     Value logicalShift =
         rewriter.create<arith::ShRUIOp>(loc, op.getLhs(), op.getRhs());
     Value arithShift =
         rewriter.create<arith::OrIOp>(loc, logicalShift, signExtBits);

     // A shift amount of zero makes the `signExtBits` shift invalid, so in
     // that case forward the unshifted LHS instead.
     Value zeroShiftSlice = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::eq, shiftLow, narrowZero);
     Value zeroShift = dropTrailingX1Dim(rewriter, loc, zeroShiftSlice);
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, zeroShift, op.getLhs(),
                                                  arithShift);

     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertSubI
 //===----------------------------------------------------------------------===//

 /// Emulates a wide `arith.subi` on the split representation: the low halves
 /// are subtracted directly and the high halves are subtracted together with a
 /// borrow taken from the low half.
 struct ConvertSubI final : OpConversionPattern<arith::SubIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::SubIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto resultVecTy =
         getTypeConverter()->convertType<VectorType>(op.getType());
     if (!resultVecTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {}", op.getType()));

     auto [lhsLow, lhsHigh] =
         extractLastDimHalves(rewriter, loc, adaptor.getLhs());
     auto [rhsLow, rhsHigh] =
         extractLastDimHalves(rewriter, loc, adaptor.getRhs());

     // LHS - RHS == [LHS.low - RHS.low, LHS.high - RHS.high - BORROW], where
     // BORROW is 1 exactly when the low subtraction wraps around, i.e. when
     // LHS.low < RHS.low (unsigned).
     Value diffLow = rewriter.create<arith::SubIOp>(loc, lhsLow, rhsLow);
     Value borrowBit = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::ult, lhsLow, rhsLow);

     // Widen the i1 borrow to the type of a single half.
     Type halfTy = reduceInnermostDim(resultVecTy);
     Value borrow = rewriter.create<arith::ExtUIOp>(loc, halfTy, borrowBit);

     Value highMinusBorrow =
         rewriter.create<arith::SubIOp>(loc, lhsHigh, borrow);
     Value diffHigh =
         rewriter.create<arith::SubIOp>(loc, highMinusBorrow, rhsHigh);

     Value combined =
         constructResultVector(rewriter, loc, resultVecTy, {diffLow, diffHigh});
     rewriter.replaceOp(op, combined);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertSIToFP
 //===----------------------------------------------------------------------===//

 /// Lowers a wide `arith.sitofp` to `arith.uitofp` of the absolute value,
 /// followed by a conditional negation of the floating-point result.
 struct ConvertSIToFP final : OpConversionPattern<arith::SIToFPOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::SIToFPOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();

     Value wideInput = op.getIn();
     Type wideIntTy = wideInput.getType();
     auto emulatedTy = getTypeConverter()->convertType<VectorType>(wideIntTy);
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", wideIntTy));

     Value intZero = createScalarOrSplatConstant(rewriter, loc, wideIntTy, 0);

     // Converting the magnitude avoids operating on very large unsigned
     // numbers. Negation is expressed as `0 - x`. All integer ops below are
     // still on the wide type; the other conversion patterns are relied upon
     // to legalize them and narrow the bit widths.
     Value inputIsNegative = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::slt, wideInput, intZero);
     Value negatedInput =
         rewriter.create<arith::SubIOp>(loc, intZero, wideInput);
     Value magnitude = rewriter.create<arith::SelectOp>(
         loc, inputIsNegative, negatedInput, wideInput);

     // Convert the magnitude, then restore the sign on the float side.
     Value magnitudeFp =
         rewriter.create<arith::UIToFPOp>(loc, op.getType(), magnitude);
     Value negatedFp = rewriter.create<arith::NegFOp>(loc, magnitudeFp);
     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, inputIsNegative,
                                                  negatedFp, magnitudeFp);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertUIToFP
 //===----------------------------------------------------------------------===//

 /// Emulates a wide `arith.uitofp` by converting the low and high halves
 /// separately and recombining them as `uitofp(low) + uitofp(hi) * 2^BW`, where
 /// `BW` is the bitwidth of the narrowed integer type.
 struct ConvertUIToFP final : OpConversionPattern<arith::UIToFPOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::UIToFPOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();

     Type oldTy = op.getIn().getType();
     auto newTy = getTypeConverter()->convertType<VectorType>(oldTy);
     if (!newTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {0}", oldTy));
     unsigned newBitWidth = newTy.getElementTypeBitWidth();

     auto [low, hi] = extractLastDimHalves(rewriter, loc, adaptor.getIn());
     Value lowInt = dropTrailingX1Dim(rewriter, loc, low);
     Value hiInt = dropTrailingX1Dim(rewriter, loc, hi);
     Value zeroCst =
         createScalarOrSplatConstant(rewriter, loc, hiInt.getType(), 0);

     // The final result has the following form:
     //   if (hi == 0) return uitofp(low)
     //   else         return uitofp(low) + uitofp(hi) * 2^BW
     //
     // where `BW` is the bitwidth of the narrowed integer type. We emit a
     // select to make it easier to fold-away the `hi` part calculation when it
     // is known to be zero.
     //
     // Note 1: The emulation is precise only for input values that have exact
     // integer representation in the result floating point type, and may lead
     // to loss of precision otherwise.
     //
     // Note 2: We do not strictly need the `hi == 0` case, but it makes
     // constant folding easier.
     Value hiEqZero = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::eq, hiInt, zeroCst);

     Type resultTy = op.getType();
     Type resultElemTy = getElementTypeOrSelf(resultTy);
     Value lowFp = rewriter.create<arith::UIToFPOp>(loc, resultTy, lowInt);
     Value hiFp = rewriter.create<arith::UIToFPOp>(loc, resultTy, hiInt);

     // Build 2^BW directly as a double by repeated doubling. Computing it as
     // `int64_t(1) << BW` would overflow for BW == 63 and is undefined
     // behavior for BW >= 64 (e.g. when emulating i128 with i64). Every
     // intermediate value here is a power of two, so the result is exact as
     // long as it is representable in a double.
     double pow2 = 1.0;
     for (unsigned bit = 0; bit < newBitWidth; ++bit)
       pow2 *= 2.0;

     TypedAttr pow2Attr = rewriter.getFloatAttr(resultElemTy, pow2);
     if (auto vecTy = dyn_cast<VectorType>(resultTy))
       pow2Attr = SplatElementsAttr::get(vecTy, pow2Attr);

     Value pow2Val = rewriter.create<arith::ConstantOp>(loc, resultTy, pow2Attr);

     Value hiVal = rewriter.create<arith::MulFOp>(loc, hiFp, pow2Val);
     Value result = rewriter.create<arith::AddFOp>(loc, lowFp, hiVal);

     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, hiEqZero, lowFp, result);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertFPToSI
 //===----------------------------------------------------------------------===//

 /// Lowers a wide `arith.fptosi` to `arith.fptoui` of the absolute value of the
 /// input, followed by a conditional integer negation.
 struct ConvertFPToSI final : OpConversionPattern<arith::FPToSIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::FPToSIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();

     // Floating-point input and the (wide) integer result type.
     Value fpInput = adaptor.getIn();
     Type floatTy = fpInput.getType();
     Type wideIntTy = op.getType();

     auto emulatedTy = getTypeConverter()->convertType<VectorType>(wideIntTy);
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {}", wideIntTy));

     // Strategy: convert |fp| with fptoui and fix up the sign afterwards. When
     // minSInt < fp < maxSInt, i.e. when the input is representable as a
     // signed i2N, this yields the correct result. Otherwise the result is UB.
     TypedAttr fpZeroAttr = rewriter.getZeroAttr(floatTy);
     Value fpZero = rewriter.create<arith::ConstantOp>(loc, fpZeroAttr);
     Value intZero = createScalarOrSplatConstant(rewriter, loc, wideIntTy, 0);

     // Compute the absolute value by hand; math.absf would do the same but
     // would introduce an extra dependency.
     Value inputIsNegative = rewriter.create<arith::CmpFOp>(
         loc, arith::CmpFPredicate::OLT, fpInput, fpZero);
     Value negatedInput = rewriter.create<arith::NegFOp>(loc, fpInput);
     Value magnitude = rewriter.create<arith::SelectOp>(
         loc, inputIsNegative, negatedInput, fpInput);

     // The unsigned conversion of the magnitude is handled by the fptoui
     // emulation.
     Value unsignedResult =
         rewriter.create<arith::FPToUIOp>(loc, wideIntTy, magnitude);

     // Negative inputs get `0 - result`.
     Value negatedResult =
         rewriter.create<arith::SubIOp>(loc, intZero, unsignedResult);

     rewriter.replaceOpWithNewOp<arith::SelectOp>(op, inputIsNegative,
                                                  negatedResult, unsignedResult);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertFPToUI
 //===----------------------------------------------------------------------===//

 /// Emulates a wide `arith.fptoui` by computing the high half as
 /// `fptoui(fp / 2^N)` and the low half as `fptoui(fp rem 2^N)`, where N is the
 /// bitwidth of the narrowed integer type.
 struct ConvertFPToUI final : OpConversionPattern<arith::FPToUIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::FPToUIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();

     // Floating-point input and its type.
     Value fpInput = adaptor.getIn();
     Type floatTy = fpInput.getType();

     Type wideIntTy = op.getType();
     auto emulatedTy = getTypeConverter()->convertType<VectorType>(wideIntTy);
     if (!emulatedTy)
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported type: {}", wideIntTy));
     unsigned halfBitWidth = emulatedTy.getElementTypeBitWidth();

     // Type of one half of the result, with the same shape as the input when
     // the input is a vector.
     auto fpVecTy = dyn_cast<VectorType>(floatTy);
     Type halfIntTy = IntegerType::get(fpInput.getContext(), halfBitWidth);
     if (fpVecTy)
       halfIntTy = VectorType::get(fpVecTy.getShape(), halfIntTy);

     // The emulated integer is read as 2^N * high + low, with N the half
     // bitwidth. Hence high = fptoui(fp / 2^N) and
     // low = fptoui(fp - high * 2^N). Overflows, including +-inf, NaNs and
     // negative inputs, are UB.
     Type fpElemTy = getElementTypeOrSelf(floatTy);
     const llvm::fltSemantics &semantics =
         cast<FloatType>(fpElemTy).getFloatSemantics();

     // Materialize 2^N in the input's float format. Since the value is a power
     // of two, an inexact status can only mean it overflowed the format; in
     // that case use inf, which makes the high half come out as 0.
     llvm::APFloat twoPowN(semantics);
     auto convStatus = twoPowN.convertFromAPInt(
         APInt(halfBitWidth * 2, 1).shl(halfBitWidth), false,
         llvm::RoundingMode::TowardZero);
     if (convStatus == llvm::detail::opStatus::opInexact)
       twoPowN = llvm::APFloat::getInf(semantics);

     TypedAttr twoPowNAttr = FloatAttr::get(fpElemTy, twoPowN);
     if (fpVecTy)
       twoPowNAttr = SplatElementsAttr::get(fpVecTy, twoPowNAttr);
     Value twoPowNCst = rewriter.create<arith::ConstantOp>(loc, twoPowNAttr);

     // High half: integer part of fp / 2^N.
     Value quotient = rewriter.create<arith::DivFOp>(loc, fpInput, twoPowNCst);
     Value highHalf =
         rewriter.create<arith::FPToUIOp>(loc, halfIntTy, quotient);

     // Low half: fp - high * 2^N, obtained as the remainder of the division.
     Value rem = rewriter.create<arith::RemFOp>(loc, fpInput, twoPowNCst);
     Value lowHalf = rewriter.create<arith::FPToUIOp>(loc, halfIntTy, rem);

     Value highSlice = appendX1Dim(rewriter, loc, highHalf);
     Value lowSlice = appendX1Dim(rewriter, loc, lowHalf);

     Value combined = constructResultVector(rewriter, loc, emulatedTy,
                                            {lowSlice, highSlice});

     rewriter.replaceOp(op, combined);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertTruncI
 //===----------------------------------------------------------------------===//

 /// Lowers `arith.trunci` of an emulated wide integer by keeping only the low
 /// half of the input and truncating it further when needed.
 struct ConvertTruncI final : OpConversionPattern<arith::TruncIOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(arith::TruncIOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
     Type resultTy = op.getType();

     // Truncation to a type wider than what the target supports is currently
     // not handled, so bail out unless the result type is already legal.
     if (!getTypeConverter()->isLegal(resultTy))
       return rewriter.notifyMatchFailure(
           loc, llvm::formatv("unsupported truncation result type: {0}",
                              op.getType()));

     // The high half never contributes to the result: take the low half and
     // let createOrFold drop the truncation when it folds away.
     Value lowSlice = extractLastDimSlice(rewriter, loc, adaptor.getIn(), 0);
     Value lowHalf = dropTrailingX1Dim(rewriter, loc, lowSlice);
     Value narrowed =
         rewriter.createOrFold<arith::TruncIOp>(loc, resultTy, lowHalf);
     rewriter.replaceOp(op, narrowed);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // ConvertVectorPrint
 //===----------------------------------------------------------------------===//

 /// Re-creates `vector.print` on the already-converted source operand.
 struct ConvertVectorPrint final : OpConversionPattern<vector::PrintOp> {
   using OpConversionPattern::OpConversionPattern;

   LogicalResult
   matchAndRewrite(vector::PrintOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // The adaptor exposes the operand after type conversion.
     Value convertedSource = adaptor.getSource();
     rewriter.replaceOpWithNewOp<vector::PrintOp>(op, convertedSource);
     return success();
   }
 };

 //===----------------------------------------------------------------------===//
 // Pass Definition
 //===----------------------------------------------------------------------===//

 /// Pass that runs the wide-integer emulation patterns, together with the
 /// `func.*` type-conversion patterns, as a partial conversion over the
 /// operation it is scheduled on.
 struct EmulateWideIntPass final
     : arith::impl::ArithEmulateWideIntBase<EmulateWideIntPass> {
   using ArithEmulateWideIntBase::ArithEmulateWideIntBase;

   void runOnOperation() override {
     // The type converter asserts on widths that are not a power of two or
     // that are below 2, so reject those up front.
     unsigned maxWidth = widestIntSupported;
     bool widthIsUsable = llvm::isPowerOf2_32(maxWidth) && maxWidth >= 2;
     if (!widthIsUsable)
       return signalPassFailure();

     Operation *root = getOperation();
     MLIRContext *ctx = root->getContext();
     arith::WideIntEmulationConverter typeConverter(maxWidth);

     // An op is legal once the type converter considers it legal. Functions
     // are judged by their signature instead.
     auto isOpLegal = [&typeConverter](Operation *candidate) {
       return typeConverter.isLegal(candidate);
     };

     ConversionTarget target(*ctx);
     target.addDynamicallyLegalOp<func::FuncOp>(
         [&typeConverter](func::FuncOp funcOp) {
           return typeConverter.isLegal(funcOp.getFunctionType());
         });
     target.addDynamicallyLegalOp<func::CallOp, func::ReturnOp>(isOpLegal);
     target.addDynamicallyLegalOp<vector::PrintOp>(isOpLegal);
     target.addDynamicallyLegalDialect<arith::ArithDialect>(isOpLegal);
     target.addLegalDialect<vector::VectorDialect>();

     // `arith.*` emulation patterns first, then the `func.*` conversions.
     RewritePatternSet patterns(ctx);
     arith::populateArithWideIntEmulationPatterns(typeConverter, patterns);
     populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(
         patterns, typeConverter);
     populateCallOpTypeConversionPattern(patterns, typeConverter);
     populateReturnOpTypeConversionPattern(patterns, typeConverter);

     LogicalResult status =
         applyPartialConversion(root, target, std::move(patterns));
     if (failed(status))
       signalPassFailure();
   }
 };
 } // end anonymous namespace

 //===----------------------------------------------------------------------===//
 // Public Interface Definition
 //===----------------------------------------------------------------------===//

 /// Constructs a type converter that emulates integers of width `2 * N` with
 /// vectors of two `iN` elements, where `N` is `widestIntSupportedByTarget`.
 ///
 /// `widestIntSupportedByTarget` must be a power of two and at least 2; this
 /// is checked with assertions only.
 ///
 /// Registered conversions:
 ///   * Integer types no wider than `N` are kept as-is, `i2N` becomes
 ///     `vector<2xiN>`, and any other integer width fails to convert.
 ///   * Vector types with non-integer or sufficiently narrow integer elements
 ///     are kept as-is, `vector<...xi2N>` becomes `vector<...x2xiN>`, and any
 ///     other element width fails to convert.
 ///   * Function types have their inputs and results converted element-wise.
 ///   * Every other type is returned unchanged.
 arith::WideIntEmulationConverter::WideIntEmulationConverter(
     unsigned widestIntSupportedByTarget)
     : maxIntWidth(widestIntSupportedByTarget) {
   assert(llvm::isPowerOf2_32(widestIntSupportedByTarget) &&
          "Only power-of-two integer widths are supported");
   assert(widestIntSupportedByTarget >= 2 && "Integer type too narrow");

   // Allow unknown types.
   addConversion([](Type ty) -> std::optional<Type> { return ty; });

   // Scalar case.
   addConversion([this](IntegerType ty) -> std::optional<Type> {
     unsigned width = ty.getWidth();
     // Already representable on the target: nothing to do.
     if (width <= maxIntWidth)
       return ty;

     // i2N --> vector<2xiN>
     if (width == 2 * maxIntWidth)
       return VectorType::get(2, IntegerType::get(ty.getContext(), maxIntWidth));

     // Any other width is not supported: return a null type.
     return nullptr;
   });

   // Vector case.
   addConversion([this](VectorType ty) -> std::optional<Type> {
     // Vectors of non-integer elements are left untouched.
     auto intTy = dyn_cast<IntegerType>(ty.getElementType());
     if (!intTy)
       return ty;

     unsigned width = intTy.getWidth();
     if (width <= maxIntWidth)
       return ty;

     // vector<...xi2N> --> vector<...x2xiN>
     if (width == 2 * maxIntWidth) {
       auto newShape = to_vector(ty.getShape());
       newShape.push_back(2);
       return VectorType::get(newShape,
                              IntegerType::get(ty.getContext(), maxIntWidth));
     }

     // Any other element width is not supported: return a null type.
     return nullptr;
   });

   // Function case.
   addConversion([this](FunctionType ty) -> std::optional<Type> {
     // Convert inputs and results, e.g.:
     //   (i2N, i2N) -> i2N --> (vector<2xiN>, vector<2xiN>) -> vector<2xiN>
     SmallVector<Type> inputs;
     if (failed(convertTypes(ty.getInputs(), inputs)))
       return nullptr;

     SmallVector<Type> results;
     if (failed(convertTypes(ty.getResults(), results)))
       return nullptr;

     return FunctionType::get(ty.getContext(), inputs, results);
   });
 }

 void arith::populateArithWideIntEmulationPatterns(
     const WideIntEmulationConverter &typeConverter,
     RewritePatternSet &patterns) {
   // Populate `arith.*` conversion patterns.
   patterns.add<
       // Misc ops.
       ConvertConstant, ConvertCmpI, ConvertSelect, ConvertVectorPrint,
       // Binary ops.
       ConvertAddI, ConvertMulI, ConvertShLI, ConvertShRSI, ConvertShRUI,
       ConvertMaxMin<arith::MaxUIOp, arith::CmpIPredicate::ugt>,
       ConvertMaxMin<arith::MaxSIOp, arith::CmpIPredicate::sgt>,
       ConvertMaxMin<arith::MinUIOp, arith::CmpIPredicate::ult>,
       ConvertMaxMin<arith::MinSIOp, arith::CmpIPredicate::slt>, ConvertSubI,
       // Bitwise binary ops.
       ConvertBitwiseBinary<arith::AndIOp>, ConvertBitwiseBinary<arith::OrIOp>,
       ConvertBitwiseBinary<arith::XOrIOp>,
       // Extension and truncation ops.
       ConvertExtSI, ConvertExtUI, ConvertTruncI,
       // Cast ops.
       ConvertIndexCastIntToIndex<arith::IndexCastOp>,
       ConvertIndexCastIntToIndex<arith::IndexCastUIOp>,
       ConvertIndexCastIndexToInt<arith::IndexCastOp, arith::ExtSIOp>,
       ConvertIndexCastIndexToInt<arith::IndexCastUIOp, arith::ExtUIOp>,
       ConvertSIToFP, ConvertUIToFP, ConvertFPToUI, ConvertFPToSI>(
       typeConverter, patterns.getContext());
 }