//===- SimplifyHLFIRIntrinsics.cpp - Simplify HLFIR Intrinsics ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Normally, transformational intrinsics are lowered to calls to runtime
// functions. However, some instances of these intrinsics are faster when
// inlined into the calling function.
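//
// For example (an illustrative sketch, not the exact IR produced by this
// pass), a total reduction:
//
//   %res = hlfir.sum %array : (!hlfir.expr<?xi32>) -> i32
//
// may be inlined into a reduction loop instead of a runtime call,
// exposing the computation to further MLIR optimizations:
//
//   %c1 = arith.constant 1 : index
//   %init = arith.constant 0 : i32
//   %res = fir.do_loop %i = %c1 to %extent step %c1
//       iter_args(%acc = %init) -> (i32) {
//     %elem = hlfir.apply %array, %i : (!hlfir.expr<?xi32>, index) -> i32
//     %new = arith.addi %acc, %elem : i32
//     fir.result %new : i32
//   }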
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/Character.h"
#include "flang/Optimizer/Builder/Complex.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/IntrinsicCall.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/HLFIR/Passes.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Location.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
namespace hlfir {
#define GEN_PASS_DEF_SIMPLIFYHLFIRINTRINSICS
#include "flang/Optimizer/HLFIR/Passes.h.inc"
} // namespace hlfir
#define DEBUG_TYPE "simplify-hlfir-intrinsics"
static llvm::cl::opt<bool> forceMatmulAsElemental(
"flang-inline-matmul-as-elemental",
llvm::cl::desc("Expand hlfir.matmul as elemental operation"),
llvm::cl::init(false));
namespace {
// Helper class to generate operations related to computing
// product of values.
class ProductFactory {
public:
ProductFactory(mlir::Location loc, fir::FirOpBuilder &builder)
: loc(loc), builder(builder) {}
// Generate an update of the inner product value:
// acc += v1 * v2, OR
// acc += CONJ(v1) * v2, OR
// acc ||= v1 && v2
//
// CONJ parameter specifies whether the first complex product argument
// needs to be conjugated.
template <bool CONJ = false>
mlir::Value genAccumulateProduct(mlir::Value acc, mlir::Value v1,
mlir::Value v2) {
mlir::Type resultType = acc.getType();
acc = castToProductType(acc, resultType);
v1 = castToProductType(v1, resultType);
v2 = castToProductType(v2, resultType);
mlir::Value result;
if (mlir::isa<mlir::FloatType>(resultType)) {
result = mlir::arith::AddFOp::create(
builder, loc, acc, mlir::arith::MulFOp::create(builder, loc, v1, v2));
} else if (mlir::isa<mlir::ComplexType>(resultType)) {
if constexpr (CONJ)
result = fir::IntrinsicLibrary{builder, loc}.genConjg(resultType, v1);
else
result = v1;
result = fir::AddcOp::create(
builder, loc, acc, fir::MulcOp::create(builder, loc, result, v2));
} else if (mlir::isa<mlir::IntegerType>(resultType)) {
result = mlir::arith::AddIOp::create(
builder, loc, acc, mlir::arith::MulIOp::create(builder, loc, v1, v2));
} else if (mlir::isa<fir::LogicalType>(resultType)) {
result = mlir::arith::OrIOp::create(
builder, loc, acc, mlir::arith::AndIOp::create(builder, loc, v1, v2));
} else {
llvm_unreachable("unsupported type");
}
return builder.createConvert(loc, resultType, result);
}
private:
mlir::Location loc;
fir::FirOpBuilder &builder;
mlir::Value castToProductType(mlir::Value value, mlir::Type type) {
if (mlir::isa<fir::LogicalType>(type))
return builder.createConvert(loc, builder.getIntegerType(1), value);
// TODO: the multiplications/additions by/of zero resulting from
// complex * real are optimized by LLVM under -fno-signed-zeros
// -fno-honor-nans.
// We can make them disappear by default if we:
// * either expand the complex multiplication into real
// operations, OR
// * set nnan nsz fast-math flags to the complex operations.
if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) {
mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type);
fir::factory::Complex helper(builder, loc);
mlir::Type partType = helper.getComplexPartType(type);
return helper.insertComplexPart(zeroCmplx,
castToProductType(value, partType),
/*isImagPart=*/false);
}
return builder.createConvert(loc, type, value);
}
};
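/// Convert hlfir.transpose into an hlfir.elemental that addresses
/// the input array with transposed indices, roughly (an illustrative
/// sketch, not the exact IR):
///
///   %res = hlfir.elemental %shape unordered
///       : (!fir.shape<2>) -> !hlfir.expr<?x?xi32> {
///   ^bb0(%i: index, %j: index):
///     %elem = hlfir.apply %array, %j, %i
///         : (!hlfir.expr<?x?xi32>, index, index) -> i32
///     hlfir.yield_element %elem : i32
///   }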
class TransposeAsElementalConversion
: public mlir::OpRewritePattern<hlfir::TransposeOp> {
public:
using mlir::OpRewritePattern<hlfir::TransposeOp>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(hlfir::TransposeOp transpose,
mlir::PatternRewriter &rewriter) const override {
hlfir::ExprType expr = transpose.getType();
// TODO: hlfir.elemental supports polymorphic data types now,
// so this can be supported.
if (expr.isPolymorphic())
return rewriter.notifyMatchFailure(transpose,
"TRANSPOSE of polymorphic type");
mlir::Location loc = transpose.getLoc();
fir::FirOpBuilder builder{rewriter, transpose.getOperation()};
mlir::Type elementType = expr.getElementType();
hlfir::Entity array = hlfir::Entity{transpose.getArray()};
mlir::Value resultShape = genResultShape(loc, builder, array);
llvm::SmallVector<mlir::Value, 1> typeParams;
hlfir::genLengthParameters(loc, builder, array, typeParams);
auto genKernel = [&array](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange inputIndices) -> hlfir::Entity {
assert(inputIndices.size() == 2 && "checked in TransposeOp::validate");
const std::initializer_list<mlir::Value> initList = {inputIndices[1],
inputIndices[0]};
mlir::ValueRange transposedIndices(initList);
hlfir::Entity element =
hlfir::getElementAt(loc, builder, array, transposedIndices);
hlfir::Entity val = hlfir::loadTrivialScalar(loc, builder, element);
return val;
};
hlfir::ElementalOp elementalOp = hlfir::genElementalOp(
loc, builder, elementType, resultShape, typeParams, genKernel,
/*isUnordered=*/true, /*polymorphicMold=*/nullptr,
transpose.getResult().getType());
// It wouldn't be safe to replace block arguments with a different
// hlfir.expr type. Types can differ due to differing amounts of shape
// information.
assert(elementalOp.getResult().getType() ==
transpose.getResult().getType());
rewriter.replaceOp(transpose, elementalOp);
return mlir::success();
}
private:
static mlir::Value genResultShape(mlir::Location loc,
fir::FirOpBuilder &builder,
hlfir::Entity array) {
llvm::SmallVector<mlir::Value, 2> inExtents =
hlfir::genExtentsVector(loc, builder, array);
// transpose indices
assert(inExtents.size() == 2 && "checked in TransposeOp::validate");
return fir::ShapeOp::create(builder, loc,
mlir::ValueRange{inExtents[1], inExtents[0]});
}
};
/// Base class for converting reduction-like operations into
/// a reduction loop[-nest] optionally wrapped into hlfir.elemental.
/// It is used to handle operations produced for ALL, ANY, COUNT,
/// MAXLOC, MAXVAL, MINLOC, MINVAL, SUM intrinsics.
///
/// All of these operations take an input array and optional
/// dim and mask arguments. ALL, ANY, and COUNT do not have a mask argument.
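///
/// For example (an illustrative sketch), SUM(A, DIM=1) of a 2D array
/// becomes an hlfir.elemental over the result shape whose kernel runs
/// a reduction loop across the first dimension:
///
///   %res = hlfir.elemental %resultShape unordered
///       : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
///   ^bb0(%j: index):
///     %sum = <reduction loop over %i accumulating A(%i, %j)>
///     hlfir.yield_element %sum : i32
///   }
///
/// A total reduction (scalar result) is generated as a bare reduction
/// loop-nest without the wrapping hlfir.elemental.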
class ReductionAsElementalConverter {
public:
ReductionAsElementalConverter(mlir::Operation *op,
mlir::PatternRewriter &rewriter)
: op{op}, rewriter{rewriter}, loc{op->getLoc()}, builder{rewriter, op} {
assert(op->getNumResults() == 1);
}
virtual ~ReductionAsElementalConverter() {}
/// Do the actual conversion or return mlir::failure(),
/// if conversion is not possible.
mlir::LogicalResult convert();
private:
// Return fir.shape specifying the shape of the result
// of a reduction with DIM=dimVal. The second return value
// is the extent of the DIM dimension.
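// For example, for a 10x20x30 array and DIM=2, the result shape
// is (10, 30) and the returned DIM extent is 20.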
std::tuple<mlir::Value, mlir::Value>
genResultShapeForPartialReduction(hlfir::Entity array, int64_t dimVal);
/// \p mask is a scalar or array logical mask.
/// If \p isPresentPred is not nullptr, it is a dynamic predicate value
/// identifying whether the mask's variable is present.
/// \p indices is a range of one-based indices to access \p mask
/// when it is an array.
///
/// The method returns the scalar mask value to guard the access
/// to a single element of the input array.
mlir::Value genMaskValue(mlir::Value mask, mlir::Value isPresentPred,
mlir::ValueRange indices);
protected:
/// Return the input array.
virtual mlir::Value getSource() const = 0;
/// Return DIM, or nullptr if it is not present.
virtual mlir::Value getDim() const = 0;
/// Return MASK, or nullptr if it is not present.
virtual mlir::Value getMask() const { return nullptr; }
/// Return FastMathFlags attached to the operation
/// or arith::FastMathFlags::none, if the operation
/// does not support FastMathFlags (e.g. ALL, ANY, COUNT).
virtual mlir::arith::FastMathFlags getFastMath() const {
return mlir::arith::FastMathFlags::none;
}
/// Generates initial values for the reduction values used
/// by the reduction loop. In general, there is a single
/// loop-carried reduction value (e.g. for SUM), but, for example,
/// MAXLOC/MINLOC implementation uses multiple reductions.
/// \p oneBasedIndices contains any array indices predefined
/// before the reduction loop, i.e. it is empty for total
/// reductions, and contains the one-based indices of the wrapping
/// hlfir.elemental.
/// \p extents are the pre-computed extents of the input array.
/// For total reductions, \p extents holds extents of all dimensions.
/// For partial reductions, \p extents holds a single extent
/// of the DIM dimension.
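///
/// For example, SUM uses a single zero-initialized reduction value,
/// while a total MAXLOC of a rank-2 array may use four: two coordinate
/// values, the current maximum, and (when a mask is present) an isFirst
/// predicate.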
virtual llvm::SmallVector<mlir::Value>
genReductionInitValues(mlir::ValueRange oneBasedIndices,
const llvm::SmallVectorImpl<mlir::Value> &extents) = 0;
/// Perform reduction(s) update given a single input array's element
/// identified by \p array and \p oneBasedIndices coordinates.
/// \p currentValue specifies the current value(s) of the reduction(s)
/// inside the reduction loop body.
virtual llvm::SmallVector<mlir::Value>
reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue,
hlfir::Entity array, mlir::ValueRange oneBasedIndices) = 0;
/// Given reduction value(s) in \p reductionResults produced
/// by the reduction loop, apply any required updates and return
/// new reduction value(s) to be used after the reduction loop
/// (e.g. as the result yield of the wrapping hlfir.elemental).
/// NOTE: if the reduction loop is wrapped in hlfir.elemental,
/// the insertion point of any generated code is inside hlfir.elemental.
virtual hlfir::Entity
genFinalResult(const llvm::SmallVectorImpl<mlir::Value> &reductionResults) {
assert(reductionResults.size() == 1 &&
"default implementation of genFinalResult expect a single reduction "
"value");
return hlfir::Entity{reductionResults[0]};
}
/// Return mlir::success(), if the operation can be converted.
/// The default implementation always returns mlir::success().
/// The derived type may override the default implementation
/// with its own definition.
virtual mlir::LogicalResult isConvertible() const { return mlir::success(); }
// Default implementation of isTotalReduction() just checks
// if the result of the operation is a scalar.
// True result indicates that the reduction has to be done
// across all elements, false result indicates that
// the result is an array expression produced by an hlfir.elemental
// operation with a single reduction loop across the DIM dimension.
//
// MAXLOC/MINLOC must override this.
virtual bool isTotalReduction() const { return getResultRank() == 0; }
// Return true, if the reduction loop[-nest] may be unordered.
// In general, FP reductions may only be unordered when
// FastMathFlags::reassoc transformations are allowed.
//
// Some derived types may need to override this.
virtual bool isUnordered() const {
mlir::Type elemType = getSourceElementType();
if (mlir::isa<mlir::IntegerType, fir::LogicalType, fir::CharacterType>(
elemType))
return true;
return static_cast<bool>(getFastMath() &
mlir::arith::FastMathFlags::reassoc);
}
/// Return 0, if DIM is not present or its value does not matter
/// (for example, a reduction of a 1D array does not care about
/// the DIM value, assuming that it is a valid program).
/// Return mlir::failure(), if DIM is a constant known
/// to be invalid for the given array.
/// Otherwise, return DIM constant value.
mlir::FailureOr<int64_t> getConstDim() const {
int64_t dimVal = 0;
if (!isTotalReduction()) {
// In case of partial reduction we should ignore the operations
// with invalid DIM values. They may appear in dead code
// after constant propagation.
auto constDim = fir::getIntIfConstant(getDim());
if (!constDim)
return rewriter.notifyMatchFailure(op, "Nonconstant DIM");
dimVal = *constDim;
if ((dimVal <= 0 || dimVal > getSourceRank()))
return rewriter.notifyMatchFailure(op,
"Invalid DIM for partial reduction");
}
return dimVal;
}
/// Return hlfir::Entity of the result.
hlfir::Entity getResultEntity() const {
return hlfir::Entity{op->getResult(0)};
}
/// Return type of the result (e.g. !hlfir.expr<?xi32>).
mlir::Type getResultType() const { return getResultEntity().getType(); }
/// Return the element type of the result (e.g. i32).
mlir::Type getResultElementType() const {
return hlfir::getFortranElementType(getResultType());
}
/// Return rank of the result.
unsigned getResultRank() const { return getResultEntity().getRank(); }
/// Return the element type of the source.
mlir::Type getSourceElementType() const {
return hlfir::getFortranElementType(getSource().getType());
}
/// Return rank of the input array.
unsigned getSourceRank() const {
return hlfir::Entity{getSource()}.getRank();
}
/// The reduction operation.
mlir::Operation *op;
mlir::PatternRewriter &rewriter;
mlir::Location loc;
fir::FirOpBuilder builder;
};
/// Generate initialization value for MIN or MAX reduction
/// of the given \p type.
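/// For example, a MAX reduction of a REAL array starts from -HUGE
/// (the most negative finite value) rather than -INF, so that the
/// result for a zero-sized input is -HUGE, as the standard requires.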
template <bool IS_MAX>
static mlir::Value genMinMaxInitValue(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Type type) {
if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
// We must not use +/-INF here. If the reduction input is empty,
// the result of reduction must be +/-LARGEST.
llvm::APFloat limit = llvm::APFloat::getLargest(sem, /*Negative=*/IS_MAX);
return builder.createRealConstant(loc, type, limit);
}
unsigned bits = type.getIntOrFloatBitWidth();
int64_t limitInt = IS_MAX
? llvm::APInt::getSignedMinValue(bits).getSExtValue()
: llvm::APInt::getSignedMaxValue(bits).getSExtValue();
return builder.createIntegerConstant(loc, type, limitInt);
}
/// Generate a comparison of an array element value \p elem
/// and the current reduction value \p reduction for MIN/MAX reduction.
template <bool IS_MAX>
static mlir::Value
genMinMaxComparison(mlir::Location loc, fir::FirOpBuilder &builder,
mlir::Value elem, mlir::Value reduction) {
if (mlir::isa<mlir::FloatType>(reduction.getType())) {
// For FP reductions we want the first smallest (for MIN) or largest
// (for MAX) value that is not NaN to be used. An OGT/OLT condition will
// usually work for this unless all the values are NaN or Inf. This
// follows the same logic as NumericCompare for Minloc/Maxloc in
// extrema.cpp.
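// For example (illustrative), for MAX the combined predicate is roughly:
//   take_elem = (elem OGT red) || ((red UNE red) && (elem OEQ elem))
// i.e. take the element if it is greater, or if the current reduction
// value is NaN while the element is not.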
mlir::Value cmp =
mlir::arith::CmpFOp::create(builder, loc,
IS_MAX ? mlir::arith::CmpFPredicate::OGT
: mlir::arith::CmpFPredicate::OLT,
elem, reduction);
mlir::Value cmpNan = mlir::arith::CmpFOp::create(
builder, loc, mlir::arith::CmpFPredicate::UNE, reduction, reduction);
mlir::Value cmpNan2 = mlir::arith::CmpFOp::create(
builder, loc, mlir::arith::CmpFPredicate::OEQ, elem, elem);
cmpNan = mlir::arith::AndIOp::create(builder, loc, cmpNan, cmpNan2);
return mlir::arith::OrIOp::create(builder, loc, cmp, cmpNan);
} else if (mlir::isa<mlir::IntegerType>(reduction.getType())) {
return mlir::arith::CmpIOp::create(builder, loc,
IS_MAX ? mlir::arith::CmpIPredicate::sgt
: mlir::arith::CmpIPredicate::slt,
elem, reduction);
}
llvm_unreachable("unsupported type");
}
// Generate a predicate value indicating that an array with the given
// extents is not empty.
static mlir::Value
genIsNotEmptyArrayExtents(mlir::Location loc, fir::FirOpBuilder &builder,
const llvm::SmallVectorImpl<mlir::Value> &extents) {
mlir::Value isNotEmpty = builder.createBool(loc, true);
for (auto extent : extents) {
mlir::Value zero =
fir::factory::createZeroValue(builder, loc, extent.getType());
mlir::Value cmp = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::ne, extent, zero);
isNotEmpty = mlir::arith::AndIOp::create(builder, loc, isNotEmpty, cmp);
}
return isNotEmpty;
}
// Helper method for MIN/MAX LOC/VAL reductions.
// It returns a vector of indices such that they address
// the first element of an array (in case of total reduction)
// or its section (in case of partial reduction).
//
// In case of total reduction, oneBasedIndices must be empty;
// otherwise, it contains the one-based indices of the wrapping
// hlfir.elemental.
// Basically, the method adds the necessary number of constant-one
// indices into oneBasedIndices.
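//
// For example, for a partial reduction of a rank-3 array with DIM=2 and
// oneBasedIndices = (i, k), the returned indices are (i, 1, k).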
static llvm::SmallVector<mlir::Value> genFirstElementIndicesForReduction(
mlir::Location loc, fir::FirOpBuilder &builder, bool isTotalReduction,
mlir::FailureOr<int64_t> dim, unsigned rank,
mlir::ValueRange oneBasedIndices) {
llvm::SmallVector<mlir::Value> indices{oneBasedIndices};
mlir::Value one =
builder.createIntegerConstant(loc, builder.getIndexType(), 1);
if (isTotalReduction) {
assert(oneBasedIndices.size() == 0 &&
"wrong number of indices for total reduction");
// Set indices to all-ones.
indices.append(rank, one);
} else {
assert(oneBasedIndices.size() == rank - 1 &&
"there must be RANK-1 indices for partial reduction");
assert(mlir::succeeded(dim) && "partial reduction with invalid DIM");
// Insert constant-one index at DIM dimension.
indices.insert(indices.begin() + *dim - 1, one);
}
return indices;
}
/// Implementation of ReductionAsElementalConverter interface
/// for MAXLOC/MINLOC.
template <typename T>
class MinMaxlocAsElementalConverter : public ReductionAsElementalConverter {
static_assert(std::is_same_v<T, hlfir::MaxlocOp> ||
std::is_same_v<T, hlfir::MinlocOp>);
static constexpr unsigned maxRank = Fortran::common::maxRank;
// We have the following reduction values in the reduction loop:
// * N integer coordinates, where N is:
// - RANK(ARRAY) for total reductions.
// - 1 for partial reductions.
// * 1 reduction value holding the current MIN/MAX.
// * 1 boolean indicating whether it is the first time
// the mask is true.
//
// If useIsFirst() returns false, then the boolean loop-carried
// value is not used.
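//
// For example, a total MAXLOC over a rank-2 array with a MASK carries
// (coor1, coor2, currentMax, isFirst) through the reduction loop.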
static constexpr unsigned maxNumReductions = Fortran::common::maxRank + 2;
static constexpr bool isMax = std::is_same_v<T, hlfir::MaxlocOp>;
using Base = ReductionAsElementalConverter;
public:
MinMaxlocAsElementalConverter(T op, mlir::PatternRewriter &rewriter)
: Base{op.getOperation(), rewriter} {}
private:
virtual mlir::Value getSource() const final { return getOp().getArray(); }
virtual mlir::Value getDim() const final { return getOp().getDim(); }
virtual mlir::Value getMask() const final { return getOp().getMask(); }
virtual mlir::arith::FastMathFlags getFastMath() const final {
return getOp().getFastmath();
}
virtual mlir::LogicalResult isConvertible() const final {
if (getOp().getBack())
return rewriter.notifyMatchFailure(
getOp(), "BACK is not supported for MINLOC/MAXLOC inlining");
if (mlir::isa<fir::CharacterType>(getSourceElementType()))
return rewriter.notifyMatchFailure(
getOp(),
"CHARACTER type is not supported for MINLOC/MAXLOC inlining");
return mlir::success();
}
// If the result is scalar, then DIM does not matter,
// and this is a total reduction.
// If DIM is not present, this is a total reduction.
virtual bool isTotalReduction() const final {
return getResultRank() == 0 || !getDim();
}
virtual llvm::SmallVector<mlir::Value> genReductionInitValues(
mlir::ValueRange oneBasedIndices,
const llvm::SmallVectorImpl<mlir::Value> &extents) final;
virtual llvm::SmallVector<mlir::Value>
reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue,
hlfir::Entity array, mlir::ValueRange oneBasedIndices) final;
virtual hlfir::Entity genFinalResult(
const llvm::SmallVectorImpl<mlir::Value> &reductionResults) final;
private:
T getOp() const { return mlir::cast<T>(op); }
unsigned getNumCoors() const {
return isTotalReduction() ? getSourceRank() : 1;
}
void
checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
if (!useIsFirst())
assert(reductions.size() == getNumCoors() + 1 &&
"invalid number of reductions for MINLOC/MAXLOC");
else
assert(reductions.size() == getNumCoors() + 2 &&
"invalid number of reductions for MINLOC/MAXLOC");
}
mlir::Value
getCurrentMinMax(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
checkReductions(reductions);
return reductions[getNumCoors()];
}
mlir::Value
getIsFirst(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
checkReductions(reductions);
assert(useIsFirst() && "IsFirst predicate must not be used");
return reductions[getNumCoors() + 1];
}
// Return true iff the input can contain NaNs, and they should be
// honored, such that all-NaNs input must produce the location
// of the first unmasked NaN.
bool honorNans() const {
return !static_cast<bool>(getFastMath() & mlir::arith::FastMathFlags::nnan);
}
// Return true iff we have to use the loop-carried IsFirst predicate.
// If there is no mask, we can initialize the reductions using
// the first elements of the input.
// If NaNs are not honored, we can initialize the starting MIN/MAX
// value to +/-LARGEST; the coordinates are guaranteed to be updated
// properly for non-empty input without NaNs.
bool useIsFirst() const { return getMask() && honorNans(); }
};
template <typename T>
llvm::SmallVector<mlir::Value>
MinMaxlocAsElementalConverter<T>::genReductionInitValues(
mlir::ValueRange oneBasedIndices,
const llvm::SmallVectorImpl<mlir::Value> &extents) {
fir::IfOp ifOp;
if (!useIsFirst() && honorNans()) {
// Check if we can load the value of the first element in the array
// or its section (for partial reduction).
assert(!getMask() && "cannot fetch first element when mask is present");
assert(extents.size() == getNumCoors() &&
"wrong number of extents for MINLOC/MAXLOC reduction");
mlir::Value isNotEmpty = genIsNotEmptyArrayExtents(loc, builder, extents);
llvm::SmallVector<mlir::Value> indices = genFirstElementIndicesForReduction(
loc, builder, isTotalReduction(), getConstDim(), getSourceRank(),
oneBasedIndices);
llvm::SmallVector<mlir::Type> ifTypes(getNumCoors(),
getResultElementType());
ifTypes.push_back(getSourceElementType());
ifOp = fir::IfOp::create(builder, loc, ifTypes, isNotEmpty,
/*withElseRegion=*/true);
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
mlir::Value one =
builder.createIntegerConstant(loc, getResultElementType(), 1);
llvm::SmallVector<mlir::Value> results(getNumCoors(), one);
mlir::Value minMaxFirst =
hlfir::loadElementAt(loc, builder, hlfir::Entity{getSource()}, indices);
results.push_back(minMaxFirst);
fir::ResultOp::create(builder, loc, results);
// In the 'else' block use default init values.
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
}
// Initial value for the coordinate(s) is zero.
mlir::Value zeroCoor =
fir::factory::createZeroValue(builder, loc, getResultElementType());
llvm::SmallVector<mlir::Value> result(getNumCoors(), zeroCoor);
// Initial value for the MIN/MAX value.
mlir::Value minMaxInit =
genMinMaxInitValue<isMax>(loc, builder, getSourceElementType());
result.push_back(minMaxInit);
if (ifOp) {
fir::ResultOp::create(builder, loc, result);
builder.setInsertionPointAfter(ifOp);
result = ifOp.getResults();
} else if (useIsFirst()) {
// Initial value for the isFirst predicate. It is switched to false
// when the reduction update dynamically happens inside the reduction
// loop.
mlir::Value trueVal = builder.createBool(loc, true);
result.push_back(trueVal);
}
return result;
}
template <typename T>
llvm::SmallVector<mlir::Value>
MinMaxlocAsElementalConverter<T>::reduceOneElement(
const llvm::SmallVectorImpl<mlir::Value> &currentValue, hlfir::Entity array,
mlir::ValueRange oneBasedIndices) {
checkReductions(currentValue);
hlfir::Entity elementValue =
hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
mlir::Value cmp = genMinMaxComparison<isMax>(loc, builder, elementValue,
getCurrentMinMax(currentValue));
if (useIsFirst()) {
// If isFirst is true, then do the reduction update regardless
// of the FP comparison.
cmp =
mlir::arith::OrIOp::create(builder, loc, cmp, getIsFirst(currentValue));
}
llvm::SmallVector<mlir::Value> newIndices;
int64_t dim = 1;
if (!isTotalReduction()) {
auto dimVal = getConstDim();
assert(mlir::succeeded(dimVal) &&
"partial MINLOC/MAXLOC reduction with invalid DIM");
dim = *dimVal;
assert(getNumCoors() == 1 &&
"partial MAXLOC/MINLOC reduction must compute one coordinate");
}
for (unsigned coorIdx = 0; coorIdx < getNumCoors(); ++coorIdx) {
mlir::Value currentCoor = currentValue[coorIdx];
mlir::Value newCoor = builder.createConvert(
loc, currentCoor.getType(), oneBasedIndices[coorIdx + dim - 1]);
mlir::Value update =
mlir::arith::SelectOp::create(builder, loc, cmp, newCoor, currentCoor);
newIndices.push_back(update);
}
mlir::Value newMinMax = mlir::arith::SelectOp::create(
builder, loc, cmp, elementValue, getCurrentMinMax(currentValue));
newIndices.push_back(newMinMax);
if (useIsFirst()) {
mlir::Value newIsFirst = builder.createBool(loc, false);
newIndices.push_back(newIsFirst);
}
assert(currentValue.size() == newIndices.size() &&
"invalid number of updated reductions");
return newIndices;
}
template <typename T>
hlfir::Entity MinMaxlocAsElementalConverter<T>::genFinalResult(
const llvm::SmallVectorImpl<mlir::Value> &reductionResults) {
// Identification of the final result of MINLOC/MAXLOC:
// * If DIM is absent, the result is a rank-one array.
// * If DIM is present:
// - The result is scalar for rank-one input.
// - The result is an array of rank RANK(ARRAY)-1.
checkReductions(reductionResults);
// 16.9.137 & 16.9.143:
// The subscripts returned by MINLOC/MAXLOC are in the range
// 1 to the extent of the corresponding dimension.
mlir::Type indexType = builder.getIndexType();
// For partial reductions, the final result of the reduction
// loop is just a scalar - the coordinate within DIM dimension.
if (getResultRank() == 0 || !isTotalReduction()) {
// The result is a scalar, so just return the scalar.
assert(getNumCoors() == 1 &&
"unpexpected number of coordinates for scalar result");
return hlfir::Entity{reductionResults[0]};
}
// This is a total reduction, and there is no wrapping hlfir.elemental.
// We have to pack the reduced coordinates into a rank-one array.
unsigned rank = getSourceRank();
// TODO: in order to avoid introducing new memory effects
// we should not use a temporary in memory.
// We can use hlfir.elemental with a switch to pack all the coordinates
// into an array expression, or we can have a dedicated HLFIR operation
// for this.
mlir::Value tempArray = builder.createTemporary(
loc, fir::SequenceType::get(rank, getResultElementType()));
for (unsigned i = 0; i < rank; ++i) {
mlir::Value coor = reductionResults[i];
mlir::Value idx = builder.createIntegerConstant(loc, indexType, i + 1);
mlir::Value resultElement =
hlfir::getElementAt(loc, builder, hlfir::Entity{tempArray}, {idx});
hlfir::AssignOp::create(builder, loc, coor, resultElement);
}
mlir::Value tempExpr = hlfir::AsExprOp::create(
builder, loc, tempArray, builder.createBool(loc, false));
return hlfir::Entity{tempExpr};
}
/// Base class for numeric reductions like MAXVAL, MINVAL, SUM.
template <typename OpT>
class NumericReductionAsElementalConverterBase
: public ReductionAsElementalConverter {
using Base = ReductionAsElementalConverter;
protected:
NumericReductionAsElementalConverterBase(OpT op,
mlir::PatternRewriter &rewriter)
: Base{op.getOperation(), rewriter} {}
virtual mlir::Value getSource() const final { return getOp().getArray(); }
virtual mlir::Value getDim() const final { return getOp().getDim(); }
virtual mlir::Value getMask() const final { return getOp().getMask(); }
virtual mlir::arith::FastMathFlags getFastMath() const final {
return getOp().getFastmath();
}
OpT getOp() const { return mlir::cast<OpT>(op); }
void checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) {
assert(reductions.size() == 1 && "reduction must produce single value");
}
};
/// Reduction converter for MAXVAL/MINVAL.
template <typename T>
class MinMaxvalAsElementalConverter
: public NumericReductionAsElementalConverterBase<T> {
static_assert(std::is_same_v<T, hlfir::MaxvalOp> ||
std::is_same_v<T, hlfir::MinvalOp>);
// We have two reduction values:
// * The current MIN/MAX value.
// * 1 boolean indicating whether it is the first time
// the mask is true.
//
// The boolean flag is used to replace the initial value
// with the first input element even if it is NaN.
// If useIsFirst() returns false, then the boolean loop-carried
// value is not used.
static constexpr bool isMax = std::is_same_v<T, hlfir::MaxvalOp>;
using Base = NumericReductionAsElementalConverterBase<T>;
public:
MinMaxvalAsElementalConverter(T op, mlir::PatternRewriter &rewriter)
: Base{op, rewriter} {}
private:
virtual mlir::LogicalResult isConvertible() const final {
if (mlir::isa<fir::CharacterType>(this->getSourceElementType()))
return this->rewriter.notifyMatchFailure(
this->getOp(),
"CHARACTER type is not supported for MINVAL/MAXVAL inlining");
return mlir::success();
}
virtual llvm::SmallVector<mlir::Value> genReductionInitValues(
mlir::ValueRange oneBasedIndices,
const llvm::SmallVectorImpl<mlir::Value> &extents) final;
virtual llvm::SmallVector<mlir::Value>
reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue,
hlfir::Entity array,
mlir::ValueRange oneBasedIndices) final {
this->checkReductions(currentValue);
llvm::SmallVector<mlir::Value> result;
fir::FirOpBuilder &builder = this->builder;
mlir::Location loc = this->loc;
hlfir::Entity elementValue =
hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
mlir::Value currentMinMax = getCurrentMinMax(currentValue);
mlir::Value cmp =
genMinMaxComparison<isMax>(loc, builder, elementValue, currentMinMax);
if (useIsFirst())
cmp = mlir::arith::OrIOp::create(builder, loc, cmp,
getIsFirst(currentValue));
mlir::Value newMinMax = mlir::arith::SelectOp::create(
builder, loc, cmp, elementValue, currentMinMax);
result.push_back(newMinMax);
if (useIsFirst())
result.push_back(builder.createBool(loc, false));
return result;
}
virtual hlfir::Entity genFinalResult(
const llvm::SmallVectorImpl<mlir::Value> &reductionResults) final {
this->checkReductions(reductionResults);
return hlfir::Entity{getCurrentMinMax(reductionResults)};
}
void
checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
assert(reductions.size() == getNumReductions() &&
"invalid number of reductions for MINVAL/MAXVAL");
}
mlir::Value
getCurrentMinMax(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
this->checkReductions(reductions);
return reductions[0];
}
mlir::Value
getIsFirst(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
this->checkReductions(reductions);
assert(useIsFirst() && "IsFirst predicate must not be used");
return reductions[1];
}
// Return true iff the input can contain NaNs, and they should be
// honored, such that all-NaNs input must produce NaN result.
bool honorNans() const {
return !static_cast<bool>(this->getFastMath() &
mlir::arith::FastMathFlags::nnan);
}
// Return true iff we have to use the loop-carried IsFirst predicate.
// If there is no mask, we can initialize the reductions using
// the first elements of the input.
// If NaNs are not honored, we can initialize the starting MIN/MAX
// value to +/-LARGEST.
bool useIsFirst() const { return this->getMask() && honorNans(); }
std::size_t getNumReductions() const { return useIsFirst() ? 2 : 1; }
};
template <typename T>
llvm::SmallVector<mlir::Value>
MinMaxvalAsElementalConverter<T>::genReductionInitValues(
mlir::ValueRange oneBasedIndices,
const llvm::SmallVectorImpl<mlir::Value> &extents) {
llvm::SmallVector<mlir::Value> result;
fir::FirOpBuilder &builder = this->builder;
mlir::Location loc = this->loc;
fir::IfOp ifOp;
if (!useIsFirst() && honorNans()) {
// Check if we can load the value of the first element in the array
// or its section (for partial reduction).
assert(!this->getMask() &&
"cannot fetch first element when mask is present");
assert(extents.size() ==
(this->isTotalReduction() ? this->getSourceRank() : 1u) &&
"wrong number of extents for MINVAL/MAXVAL reduction");
mlir::Value isNotEmpty = genIsNotEmptyArrayExtents(loc, builder, extents);
llvm::SmallVector<mlir::Value> indices = genFirstElementIndicesForReduction(
loc, builder, this->isTotalReduction(), this->getConstDim(),
this->getSourceRank(), oneBasedIndices);
ifOp = fir::IfOp::create(builder, loc, this->getResultElementType(),
isNotEmpty,
/*withElseRegion=*/true);
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
mlir::Value minMaxFirst = hlfir::loadElementAt(
loc, builder, hlfir::Entity{this->getSource()}, indices);
fir::ResultOp::create(builder, loc, minMaxFirst);
// In the 'else' block use default init values.
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
}
mlir::Value init =
genMinMaxInitValue<isMax>(loc, builder, this->getResultElementType());
result.push_back(init);
if (ifOp) {
fir::ResultOp::create(builder, loc, result);
builder.setInsertionPointAfter(ifOp);
result = ifOp.getResults();
} else if (useIsFirst()) {
// Initial value for the isFirst predicate. It is switched to false
// when the reduction update dynamically happens inside the reduction
// loop.
result.push_back(builder.createBool(loc, true));
}
return result;
}
/// Reduction converter for SUM.
class SumAsElementalConverter
: public NumericReductionAsElementalConverterBase<hlfir::SumOp> {
using Base = NumericReductionAsElementalConverterBase;
public:
SumAsElementalConverter(hlfir::SumOp op, mlir::PatternRewriter &rewriter)
: Base{op, rewriter} {}
private:
virtual llvm::SmallVector<mlir::Value> genReductionInitValues(
[[maybe_unused]] mlir::ValueRange oneBasedIndices,
[[maybe_unused]] const llvm::SmallVectorImpl<mlir::Value> &extents)
final {
return {
fir::factory::createZeroValue(builder, loc, getResultElementType())};
}
virtual llvm::SmallVector<mlir::Value>
reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue,
hlfir::Entity array,
mlir::ValueRange oneBasedIndices) final {
checkReductions(currentValue);
hlfir::Entity elementValue =
hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
// NOTE: we can use "Kahan summation" the same way as the runtime does
// (e.g. when fast-math is not allowed), but let's start with
// the simple version.
return {genScalarAdd(currentValue[0], elementValue)};
}
// Generate scalar addition of the two values (of the same data type).
mlir::Value genScalarAdd(mlir::Value value1, mlir::Value value2);
};
/// Reduction converter for Product.
class ProductAsElementalConverter
: public NumericReductionAsElementalConverterBase<hlfir::ProductOp> {
using Base = NumericReductionAsElementalConverterBase;
public:
ProductAsElementalConverter(hlfir::ProductOp op,
mlir::PatternRewriter &rewriter)
: Base{op, rewriter} {}
private:
virtual llvm::SmallVector<mlir::Value> genReductionInitValues(
[[maybe_unused]] mlir::ValueRange oneBasedIndices,
[[maybe_unused]] const llvm::SmallVectorImpl<mlir::Value> &extents)
final {
return {fir::factory::createOneValue(builder, loc, getResultElementType())};
}
virtual llvm::SmallVector<mlir::Value>
reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue,
hlfir::Entity array,
mlir::ValueRange oneBasedIndices) final {
checkReductions(currentValue);
hlfir::Entity elementValue =
hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
return {genScalarMult(currentValue[0], elementValue)};
}
// Generate scalar multiplication of the two values (of the same data type).
mlir::Value genScalarMult(mlir::Value value1, mlir::Value value2);
};
/// Base class for logical reductions like ALL, ANY, COUNT.
/// They do not have MASK and FastMathFlags.
template <typename OpT>
class LogicalReductionAsElementalConverterBase
: public ReductionAsElementalConverter {
using Base = ReductionAsElementalConverter;
public:
LogicalReductionAsElementalConverterBase(OpT op,
mlir::PatternRewriter &rewriter)
: Base{op.getOperation(), rewriter} {}
protected:
OpT getOp() const { return mlir::cast<OpT>(op); }
void checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) {
assert(reductions.size() == 1 && "reduction must produce single value");
}
virtual mlir::Value getSource() const final { return getOp().getMask(); }
virtual mlir::Value getDim() const final { return getOp().getDim(); }
virtual hlfir::Entity genFinalResult(
const llvm::SmallVectorImpl<mlir::Value> &reductionResults) override {
checkReductions(reductionResults);
return hlfir::Entity{reductionResults[0]};
}
};
/// Reduction converter for ALL/ANY.
template <typename T>
class AllAnyAsElementalConverter
: public LogicalReductionAsElementalConverterBase<T> {
static_assert(std::is_same_v<T, hlfir::AllOp> ||
std::is_same_v<T, hlfir::AnyOp>);
static constexpr bool isAll = std::is_same_v<T, hlfir::AllOp>;
using Base = LogicalReductionAsElementalConverterBase<T>;
public:
AllAnyAsElementalConverter(T op, mlir::PatternRewriter &rewriter)
: Base{op, rewriter} {}
private:
virtual llvm::SmallVector<mlir::Value> genReductionInitValues(
[[maybe_unused]] mlir::ValueRange oneBasedIndices,
[[maybe_unused]] const llvm::SmallVectorImpl<mlir::Value> &extents)
final {
return {this->builder.createBool(this->loc, isAll ? true : false)};
}
virtual llvm::SmallVector<mlir::Value>
reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue,
hlfir::Entity array,
mlir::ValueRange oneBasedIndices) final {
this->checkReductions(currentValue);
fir::FirOpBuilder &builder = this->builder;
mlir::Location loc = this->loc;
hlfir::Entity elementValue =
hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
mlir::Value mask =
builder.createConvert(loc, builder.getI1Type(), elementValue);
if constexpr (isAll)
return {mlir::arith::AndIOp::create(builder, loc, mask, currentValue[0])};
else
return {mlir::arith::OrIOp::create(builder, loc, mask, currentValue[0])};
}
virtual hlfir::Entity genFinalResult(
const llvm::SmallVectorImpl<mlir::Value> &reductionValues) final {
this->checkReductions(reductionValues);
return hlfir::Entity{this->builder.createConvert(
this->loc, this->getResultElementType(), reductionValues[0])};
}
};
/// Reduction converter for COUNT.
class CountAsElementalConverter
: public LogicalReductionAsElementalConverterBase<hlfir::CountOp> {
using Base = LogicalReductionAsElementalConverterBase<hlfir::CountOp>;
public:
CountAsElementalConverter(hlfir::CountOp op, mlir::PatternRewriter &rewriter)
: Base{op, rewriter} {}
private:
virtual llvm::SmallVector<mlir::Value> genReductionInitValues(
[[maybe_unused]] mlir::ValueRange oneBasedIndices,
[[maybe_unused]] const llvm::SmallVectorImpl<mlir::Value> &extents)
final {
return {
fir::factory::createZeroValue(builder, loc, getResultElementType())};
}
virtual llvm::SmallVector<mlir::Value>
reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue,
hlfir::Entity array,
mlir::ValueRange oneBasedIndices) final {
checkReductions(currentValue);
hlfir::Entity elementValue =
hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
mlir::Value cond =
builder.createConvert(loc, builder.getI1Type(), elementValue);
mlir::Value one =
builder.createIntegerConstant(loc, getResultElementType(), 1);
mlir::Value add1 =
mlir::arith::AddIOp::create(builder, loc, currentValue[0], one);
return {mlir::arith::SelectOp::create(builder, loc, cond, add1,
currentValue[0])};
}
};
mlir::LogicalResult ReductionAsElementalConverter::convert() {
mlir::LogicalResult canConvert(isConvertible());
if (mlir::failed(canConvert))
return canConvert;
hlfir::Entity array = hlfir::Entity{getSource()};
bool isTotalReduce = isTotalReduction();
auto dimVal = getConstDim();
if (mlir::failed(dimVal))
return dimVal;
mlir::Value mask = getMask();
mlir::Value resultShape, dimExtent;
llvm::SmallVector<mlir::Value> arrayExtents;
if (isTotalReduce)
arrayExtents = hlfir::genExtentsVector(loc, builder, array);
else
std::tie(resultShape, dimExtent) =
genResultShapeForPartialReduction(array, *dimVal);
// If the mask is present and is a scalar, load its value outside of
// the reduction loop to make loop unswitching easier.
mlir::Value isPresentPred, maskValue;
if (mask) {
if (mlir::isa<fir::BaseBoxType>(mask.getType())) {
// MASK represented by a box might be dynamically optional,
// so we have to check for its presence before accessing it.
isPresentPred =
fir::IsPresentOp::create(builder, loc, builder.getI1Type(), mask);
}
if (hlfir::Entity{mask}.isScalar())
maskValue = genMaskValue(mask, isPresentPred, {});
}
auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange inputIndices) -> hlfir::Entity {
// Loop over all indices in the DIM dimension, and reduce all values.
// If DIM is not present, do total reduction.
llvm::SmallVector<mlir::Value> extents;
if (isTotalReduce)
extents = arrayExtents;
else
extents.push_back(
builder.createConvert(loc, builder.getIndexType(), dimExtent));
// Initial value for the reduction.
llvm::SmallVector<mlir::Value, 1> reductionInitValues =
genReductionInitValues(inputIndices, extents);
auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 1> {
// Generate the reduction loop-nest body.
// The initial reduction value in the innermost loop
// is passed via reductionArgs[0].
llvm::SmallVector<mlir::Value> indices;
if (isTotalReduce) {
indices = oneBasedIndices;
} else {
indices = inputIndices;
indices.insert(indices.begin() + *dimVal - 1, oneBasedIndices[0]);
}
llvm::SmallVector<mlir::Value, 1> reductionValues = reductionArgs;
llvm::SmallVector<mlir::Type, 1> reductionTypes;
llvm::transform(reductionValues, std::back_inserter(reductionTypes),
[](mlir::Value v) { return v.getType(); });
fir::IfOp ifOp;
if (mask) {
// Make the reduction value update conditional on the value
// of the mask.
if (!maskValue) {
// If the mask is an array, use the elemental and the loop indices
// to address the proper mask element.
maskValue = genMaskValue(mask, isPresentPred, indices);
}
mlir::Value isUnmasked = fir::ConvertOp::create(
builder, loc, builder.getI1Type(), maskValue);
ifOp = fir::IfOp::create(builder, loc, reductionTypes, isUnmasked,
/*withElseRegion=*/true);
// In the 'else' block return the current reduction value.
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
fir::ResultOp::create(builder, loc, reductionValues);
// In the 'then' block do the actual reduction update.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
}
reductionValues = reduceOneElement(reductionValues, array, indices);
if (ifOp) {
fir::ResultOp::create(builder, loc, reductionValues);
builder.setInsertionPointAfter(ifOp);
reductionValues = ifOp.getResults();
}
return reductionValues;
};
llvm::SmallVector<mlir::Value, 1> reductionFinalValues =
hlfir::genLoopNestWithReductions(
loc, builder, extents, reductionInitValues, genBody, isUnordered());
return genFinalResult(reductionFinalValues);
};
if (isTotalReduce) {
hlfir::Entity result = genKernel(loc, builder, mlir::ValueRange{});
rewriter.replaceOp(op, result);
return mlir::success();
}
hlfir::ElementalOp elementalOp = hlfir::genElementalOp(
loc, builder, getResultElementType(), resultShape, /*typeParams=*/{},
genKernel,
/*isUnordered=*/true, /*polymorphicMold=*/nullptr, getResultType());
// It wouldn't be safe to replace block arguments with a different
// hlfir.expr type. Types can differ due to differing amounts of shape
// information.
assert(elementalOp.getResult().getType() == op->getResult(0).getType());
rewriter.replaceOp(op, elementalOp);
return mlir::success();
}
std::tuple<mlir::Value, mlir::Value>
ReductionAsElementalConverter::genResultShapeForPartialReduction(
hlfir::Entity array, int64_t dimVal) {
llvm::SmallVector<mlir::Value> inExtents =
hlfir::genExtentsVector(loc, builder, array);
assert(dimVal > 0 && dimVal <= static_cast<int64_t>(inExtents.size()) &&
"DIM must be present and a positive constant not exceeding "
"the array's rank");
mlir::Value dimExtent = inExtents[dimVal - 1];
inExtents.erase(inExtents.begin() + dimVal - 1);
return {fir::ShapeOp::create(builder, loc, inExtents), dimExtent};
}
mlir::Value SumAsElementalConverter::genScalarAdd(mlir::Value value1,
mlir::Value value2) {
mlir::Type ty = value1.getType();
assert(ty == value2.getType() && "reduction values' types do not match");
if (mlir::isa<mlir::FloatType>(ty))
return mlir::arith::AddFOp::create(builder, loc, value1, value2);
else if (mlir::isa<mlir::ComplexType>(ty))
return fir::AddcOp::create(builder, loc, value1, value2);
else if (mlir::isa<mlir::IntegerType>(ty))
return mlir::arith::AddIOp::create(builder, loc, value1, value2);
llvm_unreachable("unsupported SUM reduction type");
}
mlir::Value ProductAsElementalConverter::genScalarMult(mlir::Value value1,
mlir::Value value2) {
mlir::Type ty = value1.getType();
assert(ty == value2.getType() && "reduction values' types do not match");
if (mlir::isa<mlir::FloatType>(ty))
return mlir::arith::MulFOp::create(builder, loc, value1, value2);
else if (mlir::isa<mlir::ComplexType>(ty))
return fir::MulcOp::create(builder, loc, value1, value2);
else if (mlir::isa<mlir::IntegerType>(ty))
return mlir::arith::MulIOp::create(builder, loc, value1, value2);
llvm_unreachable("unsupported MUL reduction type");
}
mlir::Value ReductionAsElementalConverter::genMaskValue(
mlir::Value mask, mlir::Value isPresentPred, mlir::ValueRange indices) {
mlir::OpBuilder::InsertionGuard guard(builder);
fir::IfOp ifOp;
mlir::Type maskType =
hlfir::getFortranElementType(fir::unwrapPassByRefType(mask.getType()));
if (isPresentPred) {
ifOp = fir::IfOp::create(builder, loc, maskType, isPresentPred,
/*withElseRegion=*/true);
// Use 'true', if the mask is not present.
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
mlir::Value trueValue = builder.createBool(loc, true);
trueValue = builder.createConvert(loc, maskType, trueValue);
fir::ResultOp::create(builder, loc, trueValue);
// Load the mask value, if the mask is present.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
}
hlfir::Entity maskVar{mask};
if (maskVar.isScalar()) {
if (mlir::isa<fir::BaseBoxType>(mask.getType())) {
// MASK may be a boxed scalar.
mlir::Value addr = hlfir::genVariableRawAddress(loc, builder, maskVar);
mask = fir::LoadOp::create(builder, loc, hlfir::Entity{addr});
} else {
mask = hlfir::loadTrivialScalar(loc, builder, maskVar);
}
} else {
// Load from the mask array.
assert(!indices.empty() && "no indices for addressing the mask array");
maskVar = hlfir::getElementAt(loc, builder, maskVar, indices);
mask = hlfir::loadTrivialScalar(loc, builder, maskVar);
}
if (!isPresentPred)
return mask;
fir::ResultOp::create(builder, loc, mask);
return ifOp.getResult(0);
}
/// Convert an operation that is a partial or total reduction
/// over an array of values into a reduction loop[-nest]
/// optionally wrapped into hlfir.elemental.
template <typename Op>
class ReductionConversion : public mlir::OpRewritePattern<Op> {
public:
using mlir::OpRewritePattern<Op>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(Op op, mlir::PatternRewriter &rewriter) const override {
if constexpr (std::is_same_v<Op, hlfir::MaxlocOp> ||
std::is_same_v<Op, hlfir::MinlocOp>) {
MinMaxlocAsElementalConverter<Op> converter(op, rewriter);
return converter.convert();
} else if constexpr (std::is_same_v<Op, hlfir::MaxvalOp> ||
std::is_same_v<Op, hlfir::MinvalOp>) {
MinMaxvalAsElementalConverter<Op> converter(op, rewriter);
return converter.convert();
} else if constexpr (std::is_same_v<Op, hlfir::CountOp>) {
CountAsElementalConverter converter(op, rewriter);
return converter.convert();
} else if constexpr (std::is_same_v<Op, hlfir::AllOp> ||
std::is_same_v<Op, hlfir::AnyOp>) {
AllAnyAsElementalConverter<Op> converter(op, rewriter);
return converter.convert();
} else if constexpr (std::is_same_v<Op, hlfir::SumOp>) {
SumAsElementalConverter converter{op, rewriter};
return converter.convert();
} else if constexpr (std::is_same_v<Op, hlfir::ProductOp>) {
ProductAsElementalConverter converter{op, rewriter};
return converter.convert();
}
return rewriter.notifyMatchFailure(op, "unexpected reduction operation");
}
};
template <typename Op>
class ArrayShiftConversion : public mlir::OpRewritePattern<Op> {
public:
// The implementation below only supports CShiftOp and EOShiftOp.
static_assert(std::is_same_v<Op, hlfir::CShiftOp> ||
std::is_same_v<Op, hlfir::EOShiftOp>);
using mlir::OpRewritePattern<Op>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(Op op, mlir::PatternRewriter &rewriter) const override {
hlfir::ExprType expr = mlir::dyn_cast<hlfir::ExprType>(op.getType());
assert(expr &&
"expected an expression type for the result of the array shift");
unsigned arrayRank = expr.getRank();
// When it is a 1D CSHIFT/EOSHIFT, we may assume that the DIM argument
// (whether it is present or absent) is equal to 1, otherwise,
// the program is illegal.
int64_t dimVal = 1;
if (arrayRank != 1)
if (mlir::Value dim = op.getDim()) {
auto constDim = fir::getIntIfConstant(dim);
if (!constDim)
return rewriter.notifyMatchFailure(
op, "Nonconstant DIM for CSHIFT/EOSHIFT");
dimVal = *constDim;
}
if (dimVal <= 0 || dimVal > arrayRank)
return rewriter.notifyMatchFailure(op, "Invalid DIM for CSHIFT/EOSHIFT");
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>) {
// TODO: the EOSHIFT inlining code is not ready to produce
// fir.if selecting between ARRAY and BOUNDARY (or the default
// boundary value), when they are expressions of type CHARACTER.
// This needs more work.
if (mlir::isa<fir::CharacterType>(expr.getEleTy())) {
if (!hlfir::Entity{op.getArray()}.isVariable())
return rewriter.notifyMatchFailure(
op, "EOSHIFT with ARRAY being CHARACTER expression");
if (op.getBoundary() && !hlfir::Entity{op.getBoundary()}.isVariable())
return rewriter.notifyMatchFailure(
op, "EOSHIFT with BOUNDARY being CHARACTER expression");
}
// TODO: selecting between ARRAY and BOUNDARY values with derived types
// needs more work.
if (fir::isa_derived(expr.getEleTy()))
return rewriter.notifyMatchFailure(op, "EOSHIFT of derived type");
}
// When DIM==1 and the contiguity of the input array is not statically
// known, try to exploit the fact that the leading dimension might be
// contiguous. We can do this now using hlfir.eval_in_mem with
// a dynamic check for the leading dimension contiguity.
// Otherwise, convert hlfir.cshift/eoshift to hlfir.elemental.
//
// Note that the hlfir.elemental can be inlined into other hlfir.elemental,
// while hlfir.eval_in_mem prevents this, and we will end up creating
// a temporary array for the result. We may need to come up with
// a more sophisticated logic for picking the most efficient
// representation.
hlfir::Entity array = hlfir::Entity{op.getArray()};
mlir::Type elementType = array.getFortranElementType();
if (dimVal == 1 && fir::isa_trivial(elementType) &&
// genInMemArrayShift() only works for variables currently.
array.isVariable())
rewriter.replaceOp(op, genInMemArrayShift(rewriter, op, dimVal));
else
rewriter.replaceOp(op, genElementalArrayShift(rewriter, op, dimVal));
return mlir::success();
}
private:
/// For CSHIFT, generate MODULO(\p shiftVal, \p extent).
/// For EOSHIFT, return \p shiftVal cast to \p calcType.
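/// For example (illustrative), for CSHIFT with extent 5, a shift of -7
/// is normalized to MODULO(-7, 5) == 3: shifting by -7 and shifting
/// by 3 produce the same result.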
static mlir::Value normalizeShiftValue(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Value shiftVal,
mlir::Value extent,
mlir::Type calcType) {
shiftVal = builder.createConvert(loc, calcType, shiftVal);
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>)
return shiftVal;
extent = builder.createConvert(loc, calcType, extent);
// Make sure that we do not divide by zero. When the dimension
// has zero size, turn the extent into 1. Note that the computed
// MODULO value won't be used in this case, so it does not matter
// which extent value we use.
mlir::Value zero = builder.createIntegerConstant(loc, calcType, 0);
mlir::Value one = builder.createIntegerConstant(loc, calcType, 1);
mlir::Value isZero = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::eq, extent, zero);
extent = mlir::arith::SelectOp::create(builder, loc, isZero, one, extent);
shiftVal = fir::IntrinsicLibrary{builder, loc}.genModulo(
calcType, {shiftVal, extent});
return builder.createConvert(loc, calcType, shiftVal);
}
/// The index computations for the array shifts are done using the I64 type.
/// For CSHIFT, no computation overflows in either signed or unsigned I64
/// arithmetic. For EOSHIFT, some computations may involve negative shift
/// values, so using the no-unsigned-wrap flag would be incorrect.
static void setArithOverflowFlags(Op op, fir::FirOpBuilder &builder) {
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>)
builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw);
else
builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw |
mlir::arith::IntegerOverflowFlags::nuw);
}
/// Return the element type of the EOSHIFT boundary that may be omitted
/// statically or dynamically. This element type might be used
/// to generate MLIR where we have to select between the default
/// boundary value and the dynamically absent/present boundary value.
/// If the boundary has a type not defined in Table 16.4 in 16.9.77
/// of F2023, then the return value is nullptr.
static mlir::Type getDefaultBoundaryValueType(mlir::Type elementType) {
// To be able to generate a "select" between the default boundary value
// and the dynamic boundary value, use BoxCharType for the CHARACTER
// cases. This might be a little bit inefficient, because we may
// create unnecessary tuples, but it simplifies the inlining code.
if (auto charTy = mlir::dyn_cast<fir::CharacterType>(elementType))
return fir::BoxCharType::get(charTy.getContext(), charTy.getFKind());
if (mlir::isa<fir::LogicalType>(elementType) ||
fir::isa_integer(elementType) || fir::isa_real(elementType) ||
fir::isa_complex(elementType))
return elementType;
return nullptr;
}
/// Generate the default boundary value as defined in Table 16.4 in 16.9.77
/// of F2023.
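/// That is: 0 for INTEGER, 0.0 for REAL, (0.0, 0.0) for COMPLEX,
/// false for LOGICAL, and blanks for CHARACTER.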
static mlir::Value genDefaultBoundary(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Type elementType) {
assert(getDefaultBoundaryValueType(elementType) &&
"default boundary value cannot be computed for the given type");
if (mlir::isa<fir::CharacterType>(elementType)) {
// Create an empty CHARACTER of the same kind. The assignment
// of this empty CHARACTER into the result will add the padding
// if necessary.
fir::factory::CharacterExprHelper charHelper{builder, loc};
mlir::Value zeroLen = builder.createIntegerConstant(
loc, builder.getCharacterLengthType(), 0);
fir::CharBoxValue emptyCharTemp =
charHelper.createCharacterTemp(elementType, zeroLen);
return charHelper.createEmbox(emptyCharTemp);
}
return fir::factory::createZeroValue(builder, loc, elementType);
}
/// \p entity represents the boundary operand of hlfir.eoshift.
/// This method generates a scalar boundary value fetched
/// from the boundary entity using \p indices (which may be empty,
/// if the boundary operand is scalar).
static mlir::Value loadEoshiftVal(mlir::Location loc,
fir::FirOpBuilder &builder,
hlfir::Entity entity,
mlir::ValueRange indices = {}) {
hlfir::Entity boundaryVal =
hlfir::loadElementAt(loc, builder, entity, indices);
mlir::Type boundaryValTy =
getDefaultBoundaryValueType(entity.getFortranElementType());
// Boxed !fir.char<KIND,LEN> values with known LEN are loaded
// as raw references to !fir.char<KIND,LEN>.
// We need to wrap them into a !fir.boxchar.
if (boundaryVal.isVariable() && boundaryValTy &&
mlir::isa<fir::BoxCharType>(boundaryValTy))
return hlfir::genVariableBoxChar(loc, builder, boundaryVal);
return boundaryVal;
}
/// This method generates a scalar boundary value for the given hlfir.eoshift
/// \p op that can be used to initialize cells of the result
/// if the scalar/array boundary operand is statically or dynamically
/// absent. The first result is the scalar boundary value. The second result
/// is a dynamic predicate indicating whether the scalar boundary value
/// should actually be used.
[[maybe_unused]] static std::pair<mlir::Value, mlir::Value>
genScalarBoundaryForEOShift(mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::EOShiftOp op) {
hlfir::Entity array{op.getArray()};
mlir::Type elementType = array.getFortranElementType();
if (!op.getBoundary()) {
// Boundary operand is statically absent.
mlir::Value defaultVal = genDefaultBoundary(loc, builder, elementType);
mlir::Value boundaryIsScalarPred = builder.createBool(loc, true);
return {defaultVal, boundaryIsScalarPred};
}
hlfir::Entity boundary{op.getBoundary()};
mlir::Type boundaryValTy = getDefaultBoundaryValueType(elementType);
if (boundary.isScalar()) {
if (!boundaryValTy || !boundary.mayBeOptional()) {
// The boundary must be present.
mlir::Value boundaryVal = loadEoshiftVal(loc, builder, boundary);
mlir::Value boundaryIsScalarPred = builder.createBool(loc, true);
return {boundaryVal, boundaryIsScalarPred};
}
// Boundary is a scalar that may be dynamically absent.
// If boundary is not present dynamically, we must use the default
// value.
assert(mlir::isa<fir::BaseBoxType>(boundary.getType()));
mlir::Value isPresentPred =
fir::IsPresentOp::create(builder, loc, builder.getI1Type(), boundary);
mlir::Value boundaryVal =
builder
.genIfOp(loc, {boundaryValTy}, isPresentPred,
/*withElseRegion=*/true)
.genThen([&]() {
mlir::Value boundaryVal =
loadEoshiftVal(loc, builder, boundary);
fir::ResultOp::create(builder, loc, boundaryVal);
})
.genElse([&]() {
mlir::Value defaultVal =
genDefaultBoundary(loc, builder, elementType);
fir::ResultOp::create(builder, loc, defaultVal);
})
.getResults()[0];
mlir::Value boundaryIsScalarPred = builder.createBool(loc, true);
return {boundaryVal, boundaryIsScalarPred};
}
if (!boundaryValTy || !boundary.mayBeOptional()) {
// The boundary must be present.
mlir::Value boundaryIsScalarPred = builder.createBool(loc, false);
return {nullptr, boundaryIsScalarPred};
}
// Boundary is an array that may be dynamically absent.
mlir::Value defaultVal = genDefaultBoundary(loc, builder, elementType);
mlir::Value isPresentPred =
fir::IsPresentOp::create(builder, loc, builder.getI1Type(), boundary);
// If the array is present, then boundaryIsScalarPred must be false;
// otherwise, it must be true.
mlir::Value trueVal = builder.createBool(loc, true);
mlir::Value falseVal = builder.createBool(loc, false);
mlir::Value boundaryIsScalarPred = mlir::arith::SelectOp::create(
builder, loc, isPresentPred, falseVal, trueVal);
return {defaultVal, boundaryIsScalarPred};
}
/// Generate code that produces the final boundary value to be assigned
/// to the result of hlfir.eoshift \p op. \p precomputedScalarBoundary
/// specifies the scalar boundary value pre-computed before the elemental
/// or the assignment loop. If it is nullptr, then the boundary operand
/// of \p op must be a present array. \p boundaryIsScalarPred is a dynamic
/// predicate that is true when the pre-computed scalar value must be used.
/// \p oneBasedIndices specify the indices to address into the boundary
/// array; they may be empty, if the boundary is scalar.
[[maybe_unused]] static mlir::Value selectBoundaryValue(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::EOShiftOp op,
mlir::Value precomputedScalarBoundary, mlir::Value boundaryIsScalarPred,
mlir::ValueRange oneBasedIndices) {
// Boundary is statically absent: a default value has been precomputed.
if (!op.getBoundary())
return precomputedScalarBoundary;
// Boundary is statically present and is a scalar: boundary does not depend
// upon the indices and so it has been precomputed.
hlfir::Entity boundary{op.getBoundary()};
if (boundary.isScalar())
return precomputedScalarBoundary;
// Boundary is statically present and is an array: if the scalar
// boundary has not been precomputed, this means that the data type
// of the shifted values does not provide a way to compute
// the default boundary value, so the array boundary must be dynamically
// present, and we can load the boundary values from it.
bool mustBePresent = !precomputedScalarBoundary;
if (mustBePresent)
return loadEoshiftVal(loc, builder, boundary, oneBasedIndices);
// The array boundary may be dynamically absent.
// In this case, precomputedScalarBoundary is a pre-computed scalar
// boundary value that has to be used if boundaryIsScalarPred
// is true, otherwise, the boundary value has to be loaded
// from the boundary array.
mlir::Type boundaryValTy = precomputedScalarBoundary.getType();
mlir::Value newBoundaryVal =
builder
.genIfOp(loc, {boundaryValTy}, boundaryIsScalarPred,
/*withElseRegion=*/true)
.genThen([&]() {
fir::ResultOp::create(builder, loc, precomputedScalarBoundary);
})
.genElse([&]() {
mlir::Value elem =
loadEoshiftVal(loc, builder, boundary, oneBasedIndices);
fir::ResultOp::create(builder, loc, elem);
})
.getResults()[0];
return newBoundaryVal;
}
/// Convert \p op into an hlfir.elemental using
/// the pre-computed constant \p dimVal.
static mlir::Operation *
genElementalArrayShift(mlir::PatternRewriter &rewriter, Op op,
int64_t dimVal) {
using Fortran::common::maxRank;
hlfir::Entity shift = hlfir::Entity{op.getShift()};
hlfir::Entity array = hlfir::Entity{op.getArray()};
mlir::Location loc = op.getLoc();
fir::FirOpBuilder builder{rewriter, op.getOperation()};
// The new index computation involves MODULO, which is not implemented
// for IndexType, so use I64 instead.
mlir::Type calcType = builder.getI64Type();
// Set the indices arithmetic overflow flags.
setArithOverflowFlags(op, builder);
mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
llvm::SmallVector<mlir::Value, maxRank> arrayExtents =
hlfir::getExplicitExtentsFromShape(arrayShape, builder);
llvm::SmallVector<mlir::Value, 1> typeParams;
hlfir::genLengthParameters(loc, builder, array, typeParams);
mlir::Value shiftDimExtent =
builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]);
mlir::Value shiftVal;
if (shift.isScalar()) {
shiftVal = hlfir::loadTrivialScalar(loc, builder, shift);
shiftVal =
normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType);
}
// The boundary operand of hlfir.eoshift may be statically or
// dynamically absent.
// In both cases, it is assumed to be a scalar with the default
// value corresponding to the array element type.
// boundaryIsScalarPred is a dynamic predicate that identifies
// these cases. If boundaryIsScalarPred is dynamically false,
// then the boundary operand must be a present array.
mlir::Value boundaryVal, boundaryIsScalarPred;
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>)
std::tie(boundaryVal, boundaryIsScalarPred) =
genScalarBoundaryForEOShift(loc, builder, op);
auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange inputIndices) -> hlfir::Entity {
llvm::SmallVector<mlir::Value, maxRank> indices{inputIndices};
if (!shiftVal) {
// When the array is not a vector, the section
// (s(1), s(2), ..., s(dim-1), :, s(dim+1), ..., s(n))
// of the result has a value equal to:
// CSHIFT(ARRAY(s(1), s(2), ..., s(dim-1), :, s(dim+1), ..., s(n)),
// SH, 1),
// where SH is either SHIFT (if scalar) or
// SHIFT(s(1), s(2), ..., s(dim-1), s(dim+1), ..., s(n)).
llvm::SmallVector<mlir::Value, maxRank> shiftIndices{indices};
shiftIndices.erase(shiftIndices.begin() + dimVal - 1);
hlfir::Entity shiftElement =
hlfir::getElementAt(loc, builder, shift, shiftIndices);
shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement);
shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent,
calcType);
}
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>) {
llvm::SmallVector<mlir::Value, maxRank> boundaryIndices{indices};
boundaryIndices.erase(boundaryIndices.begin() + dimVal - 1);
boundaryVal =
selectBoundaryValue(loc, builder, op, boundaryVal,
boundaryIsScalarPred, boundaryIndices);
}
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>) {
// EOSHIFT:
// Element i of the result (1-based) is the element of the original
// array (or its section, when ARRAY is not a vector) with index
// (i + SH), if (1 <= i + SH <= SIZE(ARRAY,DIM)), otherwise
// it is the BOUNDARY value.
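// For illustration (not generated code): with a vector
// ARRAY = [1, 2, 3, 4], SHIFT = 1 and BOUNDARY = b, the result is
// [2, 3, 4, b]; with SHIFT = -1 it is [b, 1, 2, 3].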
mlir::Value index =
builder.createConvert(loc, calcType, inputIndices[dimVal - 1]);
mlir::arith::IntegerOverflowFlags savedFlags =
builder.getIntegerOverflowFlags();
builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw);
mlir::Value indexPlusShift =
mlir::arith::AddIOp::create(builder, loc, index, shiftVal);
builder.setIntegerOverflowFlags(savedFlags);
mlir::Value one = builder.createIntegerConstant(loc, calcType, 1);
mlir::Value cmp1 = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::sge, indexPlusShift, one);
mlir::Value cmp2 = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::sle, indexPlusShift,
shiftDimExtent);
mlir::Value loadFromArray =
mlir::arith::AndIOp::create(builder, loc, cmp1, cmp2);
mlir::Type boundaryValTy = boundaryVal.getType();
mlir::Value result =
builder
.genIfOp(loc, {boundaryValTy}, loadFromArray,
/*withElseRegion=*/true)
.genThen([&]() {
indices[dimVal - 1] = builder.createConvert(
loc, builder.getIndexType(), indexPlusShift);
mlir::Value elem =
loadEoshiftVal(loc, builder, array, indices);
fir::ResultOp::create(builder, loc, elem);
})
.genElse(
[&]() { fir::ResultOp::create(builder, loc, boundaryVal); })
.getResults()[0];
return hlfir::Entity{result};
} else {
// CSHIFT:
// Element i of the result (1-based) is element
// 'MODULO(i + SH - 1, SIZE(ARRAY,DIM)) + 1' (1-based) of the original
// ARRAY (or its section, when ARRAY is not a vector).
// Compute the index into the original array using the normalized
// shift value, which satisfies (SH >= 0 && SH < SIZE(ARRAY,DIM)):
// newIndex =
// i + ((i <= SIZE(ARRAY,DIM) - SH) ? SH : SH - SIZE(ARRAY,DIM))
//
// Such index computation allows for further loop vectorization
// in LLVM.
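// A worked example of this computation: for SIZE(ARRAY,DIM) = 4 and
// normalized SH = 1, indices i = 1..3 satisfy i <= 4 - SH and get
// newIndex = i + 1, while i = 4 takes the wrapped branch:
// newIndex = 4 + (1 - 4) = 1 = MODULO(4 + 1 - 1, 4) + 1.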
mlir::Value wrapBound =
mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal);
mlir::Value adjustedShiftVal =
mlir::arith::SubIOp::create(builder, loc, shiftVal, shiftDimExtent);
mlir::Value index =
builder.createConvert(loc, calcType, inputIndices[dimVal - 1]);
mlir::Value wrapCheck = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::sle, index, wrapBound);
mlir::Value actualShift = mlir::arith::SelectOp::create(
builder, loc, wrapCheck, shiftVal, adjustedShiftVal);
mlir::Value newIndex =
mlir::arith::AddIOp::create(builder, loc, index, actualShift);
newIndex = builder.createConvert(loc, builder.getIndexType(), newIndex);
indices[dimVal - 1] = newIndex;
hlfir::Entity element =
hlfir::getElementAt(loc, builder, array, indices);
return hlfir::loadTrivialScalar(loc, builder, element);
}
};
mlir::Type elementType = array.getFortranElementType();
hlfir::ElementalOp elementalOp = hlfir::genElementalOp(
loc, builder, elementType, arrayShape, typeParams, genKernel,
/*isUnordered=*/true,
array.isPolymorphic() ? static_cast<mlir::Value>(array) : nullptr,
op.getResult().getType());
return elementalOp.getOperation();
}
/// Convert \p op into an hlfir.eval_in_mem using the pre-computed
/// constant \p dimVal.
/// The converted code for CSHIFT looks like this:
/// DEST_OFFSET = SIZE(ARRAY,DIM) - SH
/// COPY_END1 = SH
/// do i=1,COPY_END1
/// result(i + DEST_OFFSET) = array(i)
/// end
/// SOURCE_OFFSET = SH
/// COPY_END2 = SIZE(ARRAY,DIM) - SH
/// do i=1,COPY_END2
/// result(i) = array(i + SOURCE_OFFSET)
/// end
/// Where SH is the normalized shift value, which satisfies
/// (SH >= 0 && SH < SIZE(ARRAY,DIM)).
///
/// The converted code for EOSHIFT looks like this:
/// EXTENT = SIZE(ARRAY,DIM)
/// DEST_OFFSET = SH < 0 ? -SH : 0
/// SOURCE_OFFSET = SH < 0 ? 0 : SH
/// COPY_END = SH < 0 ?
/// (-EXTENT > SH ? 0 : EXTENT + SH) :
/// (EXTENT < SH ? 0 : EXTENT - SH)
/// do i=1,COPY_END
/// result(i + DEST_OFFSET) = array(i + SOURCE_OFFSET)
/// end
/// INIT_END = EXTENT - COPY_END
/// INIT_OFFSET = SH < 0 ? 0 : COPY_END
/// do i=1,INIT_END
/// result(i + INIT_OFFSET) = BOUNDARY
/// end
/// Where SH is the original shift value.
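/// For example, a worked instance of the formulas above: with
/// EXTENT = 4 and SH = -1, we get DEST_OFFSET = 1, SOURCE_OFFSET = 0
/// and COPY_END = EXTENT + SH = 3, so result(2:4) = array(1:3);
/// then INIT_END = 1 and INIT_OFFSET = 0, so result(1) = BOUNDARY.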
///
/// When \p dimVal is 1, we generate the same code twice
/// under a dynamic check for the contiguity of the leading
/// dimension. In the code corresponding to the contiguous
/// leading dimension, the shift dimension is represented
/// as a contiguous slice of the original array.
/// This allows recognizing the above two loops as memcpy
/// loop idioms in LLVM.
static mlir::Operation *genInMemArrayShift(mlir::PatternRewriter &rewriter,
Op op, int64_t dimVal) {
using Fortran::common::maxRank;
hlfir::Entity shift = hlfir::Entity{op.getShift()};
hlfir::Entity array = hlfir::Entity{op.getArray()};
assert(array.isVariable() && "array must be a variable");
assert(!array.isPolymorphic() &&
"genInMemArrayShift does not support polymorphic types");
mlir::Location loc = op.getLoc();
fir::FirOpBuilder builder{rewriter, op.getOperation()};
// The new index computation involves MODULO, which is not implemented
// for IndexType, so use I64 instead.
mlir::Type calcType = builder.getI64Type();
// Set the indices arithmetic overflow flags.
setArithOverflowFlags(op, builder);
mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
llvm::SmallVector<mlir::Value, maxRank> arrayExtents =
hlfir::getExplicitExtentsFromShape(arrayShape, builder);
llvm::SmallVector<mlir::Value, 1> typeParams;
hlfir::genLengthParameters(loc, builder, array, typeParams);
mlir::Value shiftDimExtent =
builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]);
mlir::Value shiftVal;
if (shift.isScalar()) {
shiftVal = hlfir::loadTrivialScalar(loc, builder, shift);
shiftVal =
normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType);
}
// The boundary operand of hlfir.eoshift may be statically or
// dynamically absent.
// In both cases, it is assumed to be a scalar with the default
// value corresponding to the array element type.
// boundaryIsScalarPred is a dynamic predicate that identifies
// these cases. If boundaryIsScalarPred is dynamically false,
// then the boundary operand must be a present array.
mlir::Value boundaryVal, boundaryIsScalarPred;
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>)
std::tie(boundaryVal, boundaryIsScalarPred) =
genScalarBoundaryForEOShift(loc, builder, op);
hlfir::EvaluateInMemoryOp evalOp = hlfir::EvaluateInMemoryOp::create(
builder, loc, mlir::cast<hlfir::ExprType>(op.getType()), arrayShape);
builder.setInsertionPointToStart(&evalOp.getBody().front());
mlir::Value resultArray = evalOp.getMemory();
mlir::Type arrayType = fir::dyn_cast_ptrEleTy(resultArray.getType());
resultArray = builder.createBox(loc, fir::BoxType::get(arrayType),
resultArray, arrayShape, /*slice=*/nullptr,
typeParams, /*tdesc=*/nullptr);
// This is a generator of the dimension shift code.
// The code is inserted inside a loop nest over the other dimensions
// (if any). If exposeContiguity is true, the array's section
// array(s(1), ..., s(dim-1), :, s(dim+1), ..., s(n)) is represented
// as a contiguous 1D array.
// For CSHIFT, shiftVal is the normalized shift value that satisfies
// (SH >= 0 && SH < SIZE(ARRAY,DIM)).
//
auto genDimensionShift = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::Value shiftVal, mlir::Value boundary,
bool exposeContiguity,
mlir::ValueRange oneBasedIndices)
-> llvm::SmallVector<mlir::Value, 0> {
// Create a vector of indices (s(1), ..., s(dim-1), nullptr, s(dim+1),
// ..., s(n)) so that we can update the dimVal index as needed.
llvm::SmallVector<mlir::Value, maxRank> srcIndices(
oneBasedIndices.begin(), oneBasedIndices.begin() + (dimVal - 1));
srcIndices.push_back(nullptr);
srcIndices.append(oneBasedIndices.begin() + (dimVal - 1),
oneBasedIndices.end());
llvm::SmallVector<mlir::Value, maxRank> dstIndices(srcIndices);
hlfir::Entity srcArray = array;
if (exposeContiguity && mlir::isa<fir::BaseBoxType>(srcArray.getType())) {
assert(dimVal == 1 && "can expose contiguity only for dim 1");
llvm::SmallVector<mlir::Value, maxRank> arrayLbounds =
hlfir::genLowerbounds(loc, builder, arrayShape, array.getRank());
hlfir::Entity section =
hlfir::gen1DSection(loc, builder, srcArray, dimVal, arrayLbounds,
arrayExtents, oneBasedIndices, typeParams);
mlir::Value addr = hlfir::genVariableRawAddress(loc, builder, section);
mlir::Value shape = hlfir::genShape(loc, builder, section);
mlir::Type boxType = fir::wrapInClassOrBoxType(
hlfir::getFortranElementOrSequenceType(section.getType()),
section.isPolymorphic());
srcArray = hlfir::Entity{
builder.createBox(loc, boxType, addr, shape, /*slice=*/nullptr,
/*lengths=*/{}, /*tdesc=*/nullptr)};
// When shifting the dimension as a 1D section of the original
// array, we only need one index for addressing.
srcIndices.resize(1);
}
// The genCopy lambda generates the body of a generic copy loop.
// do i=1,COPY_END
// result(i + DEST_OFFSET) = array(i + SOURCE_OFFSET)
// end
//
// It is parameterized by DEST_OFFSET and SOURCE_OFFSET.
mlir::Value dstOffset, srcOffset;
auto genCopy = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange index, mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 0> {
assert(index.size() == 1 && "expected single loop");
mlir::Value srcIndex = builder.createConvert(loc, calcType, index[0]);
mlir::Value dstIndex = srcIndex;
if (srcOffset)
srcIndex =
mlir::arith::AddIOp::create(builder, loc, srcIndex, srcOffset);
srcIndices[dimVal - 1] = srcIndex;
hlfir::Entity srcElementValue =
hlfir::loadElementAt(loc, builder, srcArray, srcIndices);
if (dstOffset)
dstIndex =
mlir::arith::AddIOp::create(builder, loc, dstIndex, dstOffset);
dstIndices[dimVal - 1] = dstIndex;
hlfir::Entity dstElement = hlfir::getElementAt(
loc, builder, hlfir::Entity{resultArray}, dstIndices);
hlfir::AssignOp::create(builder, loc, srcElementValue, dstElement);
// Reset the external parameters' values to make sure
// they are properly updated between the lambda calls.
// WARNING: if genLoopNestWithReductions() calls the lambda
// multiple times, this is going to be a problem.
dstOffset = nullptr;
srcOffset = nullptr;
return {};
};
if constexpr (std::is_same_v<Op, hlfir::CShiftOp>) {
// Copy first portion of the array:
// DEST_OFFSET = SIZE(ARRAY,DIM) - SH
// COPY_END1 = SH
// do i=1,COPY_END1
// result(i + DEST_OFFSET) = array(i)
// end
dstOffset =
mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal);
srcOffset = nullptr;
hlfir::genLoopNestWithReductions(loc, builder, {shiftVal},
/*reductionInits=*/{}, genCopy,
/*isUnordered=*/true);
// Copy second portion of the array:
// SOURCE_OFFSET = SH
// COPY_END2 = SIZE(ARRAY,DIM) - SH
// do i=1,COPY_END2
// result(i) = array(i + SOURCE_OFFSET)
// end
mlir::Value bound =
mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal);
dstOffset = nullptr;
srcOffset = shiftVal;
hlfir::genLoopNestWithReductions(loc, builder, {bound},
/*reductionInits=*/{}, genCopy,
/*isUnordered=*/true);
} else {
// Do the copy:
// EXTENT = SIZE(ARRAY,DIM)
// DEST_OFFSET = SH < 0 ? -SH : 0
// SOURCE_OFFSET = SH < 0 ? 0 : SH
// COPY_END = SH < 0 ?
// (-EXTENT > SH ? 0 : EXTENT + SH) :
// (EXTENT < SH ? 0 : EXTENT - SH)
// do i=1,COPY_END
// result(i + DEST_OFFSET) = array(i + SOURCE_OFFSET)
// end
mlir::arith::IntegerOverflowFlags savedFlags =
builder.getIntegerOverflowFlags();
builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw);
mlir::Value zero = builder.createIntegerConstant(loc, calcType, 0);
mlir::Value isNegativeShift = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::slt, shiftVal, zero);
mlir::Value shiftNeg =
mlir::arith::SubIOp::create(builder, loc, zero, shiftVal);
dstOffset = mlir::arith::SelectOp::create(builder, loc, isNegativeShift,
shiftNeg, zero);
srcOffset = mlir::arith::SelectOp::create(builder, loc, isNegativeShift,
zero, shiftVal);
mlir::Value extentNeg =
mlir::arith::SubIOp::create(builder, loc, zero, shiftDimExtent);
mlir::Value extentPlusShift =
mlir::arith::AddIOp::create(builder, loc, shiftDimExtent, shiftVal);
mlir::Value extentNegShiftCmp = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::sgt, extentNeg, shiftVal);
mlir::Value negativeShiftBound = mlir::arith::SelectOp::create(
builder, loc, extentNegShiftCmp, zero, extentPlusShift);
mlir::Value extentMinusShift =
mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal);
mlir::Value extentShiftCmp = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::slt, shiftDimExtent,
shiftVal);
mlir::Value positiveShiftBound = mlir::arith::SelectOp::create(
builder, loc, extentShiftCmp, zero, extentMinusShift);
mlir::Value copyEnd = mlir::arith::SelectOp::create(
builder, loc, isNegativeShift, negativeShiftBound,
positiveShiftBound);
hlfir::genLoopNestWithReductions(loc, builder, {copyEnd},
/*reductionInits=*/{}, genCopy,
/*isUnordered=*/true);
// Do the init:
// INIT_END = EXTENT - COPY_END
// INIT_OFFSET = SH < 0 ? 0 : COPY_END
// do i=1,INIT_END
// result(i + INIT_OFFSET) = BOUNDARY
// end
assert(boundary && "boundary cannot be null");
mlir::Value initEnd =
mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, copyEnd);
mlir::Value initOffset = mlir::arith::SelectOp::create(
builder, loc, isNegativeShift, zero, copyEnd);
auto genInit = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange index,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 0> {
mlir::Value dstIndex = builder.createConvert(loc, calcType, index[0]);
dstIndex =
mlir::arith::AddIOp::create(builder, loc, dstIndex, initOffset);
dstIndices[dimVal - 1] = dstIndex;
hlfir::Entity dstElement = hlfir::getElementAt(
loc, builder, hlfir::Entity{resultArray}, dstIndices);
hlfir::AssignOp::create(builder, loc, boundary, dstElement);
return {};
};
hlfir::genLoopNestWithReductions(loc, builder, {initEnd},
/*reductionInits=*/{}, genInit,
/*isUnordered=*/true);
builder.setIntegerOverflowFlags(savedFlags);
}
return {};
};
// A wrapper around genDimensionShift that computes the normalized
// shift value and manages the insertion of the multiple versions
// of the shift based on the dynamic check of the leading dimension's
// contiguity (when dimVal == 1).
auto genShiftBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 0> {
// Copy the dimension with a shift:
// SH is either SHIFT (if scalar) or SHIFT(oneBasedIndices).
if (!shiftVal) {
assert(!oneBasedIndices.empty() && "scalar shift must be precomputed");
hlfir::Entity shiftElement =
hlfir::getElementAt(loc, builder, shift, oneBasedIndices);
shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement);
shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent,
calcType);
}
if constexpr (std::is_same_v<Op, hlfir::EOShiftOp>)
boundaryVal =
selectBoundaryValue(loc, builder, op, boundaryVal,
boundaryIsScalarPred, oneBasedIndices);
// If we can fetch the byte stride of the leading dimension,
// and the byte size of the element, then we can generate
// a dynamic contiguity check and expose the leading dimension's
// contiguity in FIR, making memcpy loop idiom recognition
// possible.
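// For example (an informal sketch): for a REAL(4) array section that
// is contiguous in its leading dimension, that dimension's byte
// stride equals the element size of 4, so the check below succeeds
// at run time.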
mlir::Value elemSize;
mlir::Value stride;
if (dimVal == 1 && mlir::isa<fir::BaseBoxType>(array.getType())) {
mlir::Type indexType = builder.getIndexType();
elemSize =
fir::BoxEleSizeOp::create(builder, loc, indexType, array.getBase());
mlir::Value dimIdx =
builder.createIntegerConstant(loc, indexType, dimVal - 1);
auto boxDim =
fir::BoxDimsOp::create(builder, loc, indexType, indexType,
indexType, array.getBase(), dimIdx);
stride = boxDim.getByteStride();
}
if (array.isSimplyContiguous() || !elemSize || !stride) {
genDimensionShift(loc, builder, shiftVal, boundaryVal,
/*exposeContiguity=*/false, oneBasedIndices);
return {};
}
mlir::Value isContiguous = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::eq, elemSize, stride);
builder.genIfOp(loc, {}, isContiguous, /*withElseRegion=*/true)
.genThen([&]() {
genDimensionShift(loc, builder, shiftVal, boundaryVal,
/*exposeContiguity=*/true, oneBasedIndices);
})
.genElse([&]() {
genDimensionShift(loc, builder, shiftVal, boundaryVal,
/*exposeContiguity=*/false, oneBasedIndices);
});
return {};
};
// For 1D case, generate a single loop.
// For ND case, generate a loop nest over the other dimensions
// with a single loop inside (generated separately).
llvm::SmallVector<mlir::Value, maxRank> newExtents(arrayExtents);
newExtents.erase(newExtents.begin() + (dimVal - 1));
if (!newExtents.empty())
hlfir::genLoopNestWithReductions(loc, builder, newExtents,
/*reductionInits=*/{}, genShiftBody,
/*isUnordered=*/true);
else
genShiftBody(loc, builder, {}, {});
return evalOp.getOperation();
}
};
class CmpCharOpConversion : public mlir::OpRewritePattern<hlfir::CmpCharOp> {
public:
using mlir::OpRewritePattern<hlfir::CmpCharOp>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(hlfir::CmpCharOp cmp,
mlir::PatternRewriter &rewriter) const override {
fir::FirOpBuilder builder{rewriter, cmp.getOperation()};
const mlir::Location &loc = cmp->getLoc();
auto toVariable =
[&builder,
&loc](mlir::Value val) -> std::pair<mlir::Value, hlfir::AssociateOp> {
mlir::Value opnd;
hlfir::AssociateOp associate;
if (mlir::isa<hlfir::ExprType>(val.getType())) {
hlfir::Entity entity{val};
mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder);
associate = hlfir::genAssociateExpr(loc, builder, entity,
entity.getType(), "", byRefAttr);
opnd = associate.getBase();
} else {
opnd = val;
}
return {opnd, associate};
};
auto [lhsOpnd, lhsAssociate] = toVariable(cmp.getLchr());
auto [rhsOpnd, rhsAssociate] = toVariable(cmp.getRchr());
hlfir::Entity lhs{lhsOpnd};
hlfir::Entity rhs{rhsOpnd};
auto charTy = mlir::cast<fir::CharacterType>(lhs.getFortranElementType());
unsigned kind = charTy.getFKind();
auto bits = builder.getKindMap().getCharacterBitsize(kind);
auto intTy = builder.getIntegerType(bits);
auto idxTy = builder.getIndexType();
auto charLen1Ty =
fir::CharacterType::getSingleton(builder.getContext(), kind);
mlir::Type designatorType =
fir::ReferenceType::get(charLen1Ty, fir::isa_volatile_type(charTy));
auto idxAttr = builder.getIntegerAttr(idxTy, 0);
auto genExtractAndConvertToInt =
[&idxAttr, &intTy, &designatorType](
mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::Entity &charStr, mlir::Value index, mlir::Value length) {
auto singleChr = hlfir::DesignateOp::create(
builder, loc, designatorType, charStr, /*component=*/{},
/*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{},
/*substring=*/mlir::ValueRange{index, index},
/*complexPart=*/std::nullopt,
/*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{length},
fir::FortranVariableFlagsAttr{});
auto chrVal = fir::LoadOp::create(builder, loc, singleChr);
mlir::Value intVal = fir::ExtractValueOp::create(
builder, loc, intTy, chrVal, builder.getArrayAttr(idxAttr));
return intVal;
};
mlir::arith::CmpIPredicate predicate = cmp.getPredicate();
mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1);
mlir::Value lhsLen = builder.createConvert(
loc, idxTy, hlfir::genCharLength(loc, builder, lhs));
mlir::Value rhsLen = builder.createConvert(
loc, idxTy, hlfir::genCharLength(loc, builder, rhs));
enum class GenCmp { LeftToRight, LeftToBlank, BlankToRight };
mlir::Value zeroInt = builder.createIntegerConstant(loc, intTy, 0);
mlir::Value oneInt = builder.createIntegerConstant(loc, intTy, 1);
mlir::Value negOneInt = builder.createIntegerConstant(loc, intTy, -1);
mlir::Value blankInt = builder.createIntegerConstant(loc, intTy, ' ');
auto step = GenCmp::LeftToRight;
auto genCmp = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange index, mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 1> {
assert(index.size() == 1 && "expected single loop");
assert(reductionArgs.size() == 1 && "expected single reduction value");
mlir::Value inRes = reductionArgs[0];
auto accEQzero = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::eq, inRes, zeroInt);
mlir::Value res =
builder
.genIfOp(loc, {intTy}, accEQzero,
/*withElseRegion=*/true)
.genThen([&]() {
mlir::Value offset =
builder.createConvert(loc, idxTy, index[0]);
mlir::Value lhsInt;
mlir::Value rhsInt;
if (step == GenCmp::LeftToRight) {
lhsInt = genExtractAndConvertToInt(loc, builder, lhs, offset,
oneIdx);
rhsInt = genExtractAndConvertToInt(loc, builder, rhs, offset,
oneIdx);
} else if (step == GenCmp::LeftToBlank) {
// lhsLen > rhsLen
offset =
mlir::arith::AddIOp::create(builder, loc, rhsLen, offset);
lhsInt = genExtractAndConvertToInt(loc, builder, lhs, offset,
oneIdx);
rhsInt = blankInt;
} else if (step == GenCmp::BlankToRight) {
// rhsLen > lhsLen
offset =
mlir::arith::AddIOp::create(builder, loc, lhsLen, offset);
lhsInt = blankInt;
rhsInt = genExtractAndConvertToInt(loc, builder, rhs, offset,
oneIdx);
} else {
llvm_unreachable(
"unknown compare step for CmpCharOp lowering");
}
mlir::Value newVal = mlir::arith::SelectOp::create(
builder, loc,
mlir::arith::CmpIOp::create(builder, loc,
mlir::arith::CmpIPredicate::ult,
lhsInt, rhsInt),
negOneInt, inRes);
newVal = mlir::arith::SelectOp::create(
builder, loc,
mlir::arith::CmpIOp::create(builder, loc,
mlir::arith::CmpIPredicate::ugt,
lhsInt, rhsInt),
oneInt, newVal);
fir::ResultOp::create(builder, loc, newVal);
})
.genElse([&]() { fir::ResultOp::create(builder, loc, inRes); })
.getResults()[0];
return {res};
};
// First, generate a comparison of the two strings over the length
// of the shorter one.
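// As a reminder of the semantics (an illustration, not generated
// code): Fortran character comparison behaves as if the shorter
// string were blank-padded, so 'ab' compares equal to 'ab  ' and
// 'ab' compares less than 'b'.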
mlir::Value minLen = mlir::arith::SelectOp::create(
builder, loc,
mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::slt, lhsLen, rhsLen),
lhsLen, rhsLen);
llvm::SmallVector<mlir::Value, 1> loopOut =
hlfir::genLoopNestWithReductions(loc, builder, {minLen},
/*reductionInits=*/{zeroInt}, genCmp,
/*isUnordered=*/false);
mlir::Value partRes = loopOut[0];
auto lhsLonger = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::sgt, lhsLen, rhsLen);
mlir::Value tempRes =
builder
.genIfOp(loc, {intTy}, lhsLonger,
/*withElseRegion=*/true)
.genThen([&]() {
// If left is the longer string, generate the left-to-blank compare.
step = GenCmp::LeftToBlank;
auto lenDiff =
mlir::arith::SubIOp::create(builder, loc, lhsLen, rhsLen);
llvm::SmallVector<mlir::Value, 1> output =
hlfir::genLoopNestWithReductions(loc, builder, {lenDiff},
/*reductionInits=*/{partRes},
genCmp,
/*isUnordered=*/false);
mlir::Value res = output[0];
fir::ResultOp::create(builder, loc, res);
})
.genElse([&]() {
// If right is the longer string, generate the blank-to-right
// compare.
step = GenCmp::BlankToRight;
auto lenDiff =
mlir::arith::SubIOp::create(builder, loc, rhsLen, lhsLen);
llvm::SmallVector<mlir::Value, 1> output =
hlfir::genLoopNestWithReductions(loc, builder, {lenDiff},
/*reductionInits=*/{partRes},
genCmp,
/*isUnordered=*/false);
mlir::Value res = output[0];
fir::ResultOp::create(builder, loc, res);
})
.getResults()[0];
if (lhsAssociate)
hlfir::EndAssociateOp::create(builder, loc, lhsAssociate);
if (rhsAssociate)
hlfir::EndAssociateOp::create(builder, loc, rhsAssociate);
auto finalCmpResult =
mlir::arith::CmpIOp::create(builder, loc, predicate, tempRes, zeroInt);
rewriter.replaceOp(cmp, finalCmpResult);
return mlir::success();
}
};
static std::pair<mlir::Value, hlfir::AssociateOp>
getVariable(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value val) {
// If it is an expression, create a variable from it; otherwise,
// forward the value.
hlfir::AssociateOp associate;
if (!mlir::isa<hlfir::ExprType>(val.getType()))
return {val, associate};
hlfir::Entity entity{val};
mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder);
associate = hlfir::genAssociateExpr(loc, builder, entity, entity.getType(),
"", byRefAttr);
return {associate.getBase(), associate};
}
class IndexOpConversion : public mlir::OpRewritePattern<hlfir::IndexOp> {
public:
using mlir::OpRewritePattern<hlfir::IndexOp>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(hlfir::IndexOp op,
mlir::PatternRewriter &rewriter) const override {
// We simplify only limited cases:
// 1) the substring length shall be known at compile time;
// 2) if the substring length is 0, then replace with 1 for forward
// search, or with the string length + 1 otherwise (the builder shall
// const-fold if the lookup direction is known at compile time);
// 3) if the string length is known at compile time and is shorter
// than the substring length, replace with zero;
// 4) if the substring length is one, inline as a simple search loop;
// 5) for forward search with input strings of kind=1, the runtime is
// faster.
// Do not simplify in all the other cases, relying on a runtime call.
fir::FirOpBuilder builder{rewriter, op.getOperation()};
const mlir::Location &loc = op->getLoc();
auto resultTy = op.getType();
mlir::Value back = op.getBack();
auto substrLenCst =
hlfir::getCharLengthIfConst(hlfir::Entity{op.getSubstr()});
if (!substrLenCst) {
return rewriter.notifyMatchFailure(
op, "substring length unknown at compile time");
}
hlfir::Entity strEntity{op.getStr()};
auto i1Ty = builder.getI1Type();
auto idxTy = builder.getIndexType();
if (*substrLenCst == 0) {
mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1);
// zero length substring. For back search replace with
// strLen+1, or otherwise with 1.
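// e.g. INDEX('abc', '') is 1, while INDEX('abc', '', BACK=.TRUE.)
// is LEN('abc') + 1 = 4.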
mlir::Value strLen = hlfir::genCharLength(loc, builder, strEntity);
mlir::Value strEnd = mlir::arith::AddIOp::create(
builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx);
if (back)
back = builder.createConvert(loc, i1Ty, back);
else
back = builder.createIntegerConstant(loc, i1Ty, 0);
mlir::Value result =
mlir::arith::SelectOp::create(builder, loc, back, strEnd, oneIdx);
rewriter.replaceOp(op, builder.createConvert(loc, resultTy, result));
return mlir::success();
}
if (auto strLenCst = hlfir::getCharLengthIfConst(strEntity)) {
if (*strLenCst < *substrLenCst) {
rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 0));
return mlir::success();
}
if (*strLenCst == 0) {
// both strings have zero length
rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 1));
return mlir::success();
}
}
if (*substrLenCst != 1) {
return rewriter.notifyMatchFailure(
op, "rely on runtime implementation if substring length > 1");
}
// For forward search with character kind=1, the runtime uses memchr,
// which is well optimized, while the memchr idiom is not recognized
// in LLVM yet. On a micro-kernel test with strings of length 40, the
// runtime had ~2x lower execution time than the inlined code. For a
// search direction unknown at compile time, pessimistically assume
// "forward".
std::optional<bool> isBack;
if (back) {
if (auto backCst = fir::getIntIfConstant(back))
isBack = *backCst != 0;
} else {
isBack = false;
}
auto charTy = mlir::cast<fir::CharacterType>(
hlfir::getFortranElementType(op.getSubstr().getType()));
unsigned kind = charTy.getFKind();
if (kind == 1 && (!isBack || !*isBack)) {
return rewriter.notifyMatchFailure(
op, "rely on runtime implementation for forward search with "
"character kind 1");
}
// All checks are passed here. Generate single character search loop.
auto [strV, strAssociate] = getVariable(builder, loc, op.getStr());
auto [substrV, substrAssociate] = getVariable(builder, loc, op.getSubstr());
hlfir::Entity str{strV};
hlfir::Entity substr{substrV};
mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1);
auto genExtractAndConvertToInt = [&charTy, &idxTy, &oneIdx,
kind](mlir::Location loc,
fir::FirOpBuilder &builder,
hlfir::Entity &charStr,
mlir::Value index) {
auto bits = builder.getKindMap().getCharacterBitsize(kind);
auto intTy = builder.getIntegerType(bits);
auto charLen1Ty =
fir::CharacterType::getSingleton(builder.getContext(), kind);
mlir::Type designatorTy =
fir::ReferenceType::get(charLen1Ty, fir::isa_volatile_type(charTy));
auto idxAttr = builder.getIntegerAttr(idxTy, 0);
auto singleChr = hlfir::DesignateOp::create(
builder, loc, designatorTy, charStr, /*component=*/{},
/*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{},
/*substring=*/mlir::ValueRange{index, index},
/*complexPart=*/std::nullopt,
/*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{oneIdx},
fir::FortranVariableFlagsAttr{});
auto chrVal = fir::LoadOp::create(builder, loc, singleChr);
mlir::Value intVal = fir::ExtractValueOp::create(
builder, loc, intTy, chrVal, builder.getArrayAttr(idxAttr));
return intVal;
};
auto wantChar = genExtractAndConvertToInt(loc, builder, substr, oneIdx);
// Generate search loop body with the following C equivalent:
// idx_t result = 0;
// idx_t end = strlen + 1;
// char want = substr[0];
// for (idx_t idx = 1; idx < end; ++idx) {
// if (result == 0) {
// idx_t at = back ? end - idx : idx;
// result = str[at-1] == want ? at : result;
// }
// }
mlir::Value strLen = hlfir::genCharLength(loc, builder, strEntity);
if (!back)
back = builder.createIntegerConstant(loc, i1Ty, 0);
else
back = builder.createConvert(loc, i1Ty, back);
mlir::Value strEnd = mlir::arith::AddIOp::create(
builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx);
mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);
auto genSearchBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange index,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 1> {
assert(index.size() == 1 && "expected single loop");
assert(reductionArgs.size() == 1 && "expected single reduction value");
mlir::Value inRes = reductionArgs[0];
auto resEQzero = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::eq, inRes, zeroIdx);
mlir::Value res =
builder
.genIfOp(loc, {idxTy}, resEQzero,
/*withElseRegion=*/true)
.genThen([&]() {
mlir::Value idx = builder.createConvert(loc, idxTy, index[0]);
// offset = back ? end - idx : idx;
mlir::Value offset = mlir::arith::SelectOp::create(
builder, loc, back,
mlir::arith::SubIOp::create(builder, loc, strEnd, idx),
idx);
auto haveChar =
genExtractAndConvertToInt(loc, builder, str, offset);
auto charsEQ = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::eq, haveChar,
wantChar);
mlir::Value newVal = mlir::arith::SelectOp::create(
builder, loc, charsEQ, offset, inRes);
fir::ResultOp::create(builder, loc, newVal);
})
.genElse([&]() { fir::ResultOp::create(builder, loc, inRes); })
.getResults()[0];
return {res};
};
llvm::SmallVector<mlir::Value, 1> loopOut =
hlfir::genLoopNestWithReductions(loc, builder, {strLen},
/*reductionInits=*/{zeroIdx},
genSearchBody,
/*isUnordered=*/false);
mlir::Value result = builder.createConvert(loc, resultTy, loopOut[0]);
if (strAssociate)
hlfir::EndAssociateOp::create(builder, loc, strAssociate);
if (substrAssociate)
hlfir::EndAssociateOp::create(builder, loc, substrAssociate);
rewriter.replaceOp(op, result);
return mlir::success();
}
};
template <typename Op>
class MatmulConversion : public mlir::OpRewritePattern<Op> {
public:
using mlir::OpRewritePattern<Op>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(Op matmul, mlir::PatternRewriter &rewriter) const override {
mlir::Location loc = matmul.getLoc();
fir::FirOpBuilder builder{rewriter, matmul.getOperation()};
hlfir::Entity lhs = hlfir::Entity{matmul.getLhs()};
hlfir::Entity rhs = hlfir::Entity{matmul.getRhs()};
mlir::Value resultShape, innerProductExtent;
std::tie(resultShape, innerProductExtent) =
genResultShape(loc, builder, lhs, rhs);
if (forceMatmulAsElemental || isMatmulTranspose) {
// Generate hlfir.elemental that produces the result of
// MATMUL/MATMUL(TRANSPOSE).
// Note that this implementation is very suboptimal for MATMUL,
// but is quite good for MATMUL(TRANSPOSE), e.g.:
// R(1:N) = R(1:N) + MATMUL(TRANSPOSE(X(1:N,1:N)), Y(1:N))
// Inlining MATMUL(TRANSPOSE) as hlfir.elemental may result
// in merging the inner product computation with the elemental
// addition. Note that the inner product computation will
// benefit from processing the lowermost dimensions of X and Y,
// which may be the best when they are contiguous.
//
// This is why we always inline MATMUL(TRANSPOSE) as an elemental.
// MATMUL is inlined below by default unless forceMatmulAsElemental.
hlfir::ExprType resultType =
mlir::cast<hlfir::ExprType>(matmul.getType());
hlfir::ElementalOp newOp = genElementalMatmul(
loc, builder, resultType, resultShape, lhs, rhs, innerProductExtent);
rewriter.replaceOp(matmul, newOp);
return mlir::success();
}
// Generate hlfir.eval_in_mem to mimic the MATMUL implementation
// from Fortran runtime. The implementation needs to operate
// with the result array as an in-memory object.
hlfir::EvaluateInMemoryOp evalOp = hlfir::EvaluateInMemoryOp::create(
builder, loc, mlir::cast<hlfir::ExprType>(matmul.getType()),
resultShape);
builder.setInsertionPointToStart(&evalOp.getBody().front());
// Embox the raw array pointer to simplify designating it.
// TODO: this currently results in redundant lower bounds
// addition for the designator, but this should be fixed in
// hlfir::Entity::mayHaveNonDefaultLowerBounds().
mlir::Value resultArray = evalOp.getMemory();
mlir::Type arrayType = fir::dyn_cast_ptrEleTy(resultArray.getType());
resultArray = builder.createBox(loc, fir::BoxType::get(arrayType),
resultArray, resultShape, /*slice=*/nullptr,
/*lengths=*/{}, /*tdesc=*/nullptr);
// The contiguous MATMUL version is best for the cases
// where the input arrays and (maybe) the result are contiguous
// in their lowermost dimensions.
// Especially, when LLVM can recognize the contiguity
// and vectorize the loops properly.
// Note that the contiguous MATMUL inlining is correct
// even when the input arrays are not contiguous.
// TODO: we can try to recognize the cases when the contiguity
// is not statically obvious and try to generate an explicitly
// contiguous version under a dynamic check. This should allow
// LLVM to vectorize the loops better. Note that this can
// also be postponed up to the LoopVersioning pass.
// The fallback implementation may use genElementalMatmul() with
// an hlfir.assign into the result of eval_in_mem.
mlir::LogicalResult rewriteResult =
genContiguousMatmul(loc, builder, hlfir::Entity{resultArray},
resultShape, lhs, rhs, innerProductExtent);
if (mlir::failed(rewriteResult)) {
// Erase the unclaimed eval_in_mem op.
rewriter.eraseOp(evalOp);
return rewriter.notifyMatchFailure(matmul,
"genContiguousMatmul() failed");
}
rewriter.replaceOp(matmul, evalOp);
return mlir::success();
}
private:
static constexpr bool isMatmulTranspose =
std::is_same_v<Op, hlfir::MatmulTransposeOp>;
// Return a tuple of:
// * A fir.shape operation representing the shape of the result
// of a MATMUL/MATMUL(TRANSPOSE).
// * An extent of the dimensions of the input array
// that are processed during the inner product computation.
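// For example (shapes only): with LHS(2,3) and RHS(3,4), MATMUL
// produces shape (2,4) with an inner product extent of 3; with
// LHS(2,3) and RHS(3) it produces shape (2); MATMUL(TRANSPOSE) with
// LHS(3,2) and RHS(3,4) produces shape (2,4), again with extent 3.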
static std::tuple<mlir::Value, mlir::Value>
genResultShape(mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::Entity input1, hlfir::Entity input2) {
llvm::SmallVector<mlir::Value, 2> input1Extents =
hlfir::genExtentsVector(loc, builder, input1);
llvm::SmallVector<mlir::Value, 2> input2Extents =
hlfir::genExtentsVector(loc, builder, input2);
llvm::SmallVector<mlir::Value, 2> newExtents;
mlir::Value innerProduct1Extent, innerProduct2Extent;
if (input1Extents.size() == 1) {
assert(!isMatmulTranspose &&
"hlfir.matmul_transpose's first operand must be rank-2 array");
assert(input2Extents.size() == 2 &&
"hlfir.matmul second argument must be rank-2 array");
newExtents.push_back(input2Extents[1]);
innerProduct1Extent = input1Extents[0];
innerProduct2Extent = input2Extents[0];
} else {
if (input2Extents.size() == 1) {
assert(input1Extents.size() == 2 &&
"hlfir.matmul first argument must be rank-2 array");
if constexpr (isMatmulTranspose)
newExtents.push_back(input1Extents[1]);
else
newExtents.push_back(input1Extents[0]);
} else {
assert(input1Extents.size() == 2 && input2Extents.size() == 2 &&
"hlfir.matmul arguments must be rank-2 arrays");
if constexpr (isMatmulTranspose)
newExtents.push_back(input1Extents[1]);
else
newExtents.push_back(input1Extents[0]);
newExtents.push_back(input2Extents[1]);
}
if constexpr (isMatmulTranspose)
innerProduct1Extent = input1Extents[0];
else
innerProduct1Extent = input1Extents[1];
innerProduct2Extent = input2Extents[0];
}
// The inner product dimensions of the input arrays
// must match. Pick the best (e.g. constant) out of them
// so that the inner product loop bound can be used in
// optimizations.
llvm::SmallVector<mlir::Value> innerProductExtent =
fir::factory::deduceOptimalExtents({innerProduct1Extent},
{innerProduct2Extent});
return {fir::ShapeOp::create(builder, loc, newExtents),
innerProductExtent[0]};
}
static mlir::LogicalResult
genContiguousMatmul(mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::Entity result, mlir::Value resultShape,
hlfir::Entity lhs, hlfir::Entity rhs,
mlir::Value innerProductExtent) {
// This code does not support MATMUL(TRANSPOSE), which is supposed
// to be inlined as hlfir.elemental.
if constexpr (isMatmulTranspose)
return mlir::failure();
mlir::OpBuilder::InsertionGuard guard(builder);
mlir::Type resultElementType = result.getFortranElementType();
llvm::SmallVector<mlir::Value, 2> resultExtents =
mlir::cast<fir::ShapeOp>(resultShape.getDefiningOp()).getExtents();
// The inner product loop may be unordered if FastMathFlags::reassoc
// transformations are allowed. The integer/logical inner product is
// always unordered.
// Note that isUnordered is currently applied to all loops
// in the loop nests generated below, while it has to be applied
// only to one.
bool isUnordered = mlir::isa<mlir::IntegerType>(resultElementType) ||
mlir::isa<fir::LogicalType>(resultElementType) ||
static_cast<bool>(builder.getFastMathFlags() &
mlir::arith::FastMathFlags::reassoc);
// Insert the initialization loop nest that fills the whole result with
// zeroes.
mlir::Value initValue =
fir::factory::createZeroValue(builder, loc, resultElementType);
auto genInitBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 0> {
hlfir::Entity resultElement =
hlfir::getElementAt(loc, builder, result, oneBasedIndices);
hlfir::AssignOp::create(builder, loc, initValue, resultElement);
return {};
};
hlfir::genLoopNestWithReductions(loc, builder, resultExtents,
/*reductionInits=*/{}, genInitBody,
/*isUnordered=*/true);
if (lhs.getRank() == 2 && rhs.getRank() == 2) {
// LHS(NROWS,N) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS)
//
// Insert the computation loop nest:
// DO 2 K = 1, N
// DO 2 J = 1, NCOLS
// DO 2 I = 1, NROWS
// 2 RESULT(I,J) = RESULT(I,J) + LHS(I,K)*RHS(K,J)
auto genMatrixMatrix = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 0> {
mlir::Value I = oneBasedIndices[0];
mlir::Value J = oneBasedIndices[1];
mlir::Value K = oneBasedIndices[2];
hlfir::Entity resultElement =
hlfir::getElementAt(loc, builder, result, {I, J});
hlfir::Entity resultElementValue =
hlfir::loadTrivialScalar(loc, builder, resultElement);
hlfir::Entity lhsElementValue =
hlfir::loadElementAt(loc, builder, lhs, {I, K});
hlfir::Entity rhsElementValue =
hlfir::loadElementAt(loc, builder, rhs, {K, J});
mlir::Value productValue =
ProductFactory{loc, builder}.genAccumulateProduct(
resultElementValue, lhsElementValue, rhsElementValue);
hlfir::AssignOp::create(builder, loc, productValue, resultElement);
return {};
};
// Note that the loops are inserted in reverse order,
// so innerProductExtent should be passed as the last extent.
hlfir::genLoopNestWithReductions(
loc, builder,
{resultExtents[0], resultExtents[1], innerProductExtent},
/*reductionInits=*/{}, genMatrixMatrix, isUnordered);
return mlir::success();
}
if (lhs.getRank() == 2 && rhs.getRank() == 1) {
// LHS(NROWS,N) * RHS(N) -> RESULT(NROWS)
//
// Insert the computation loop nest:
// DO 2 K = 1, N
// DO 2 J = 1, NROWS
// 2 RES(J) = RES(J) + LHS(J,K)*RHS(K)
auto genMatrixVector = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 0> {
mlir::Value J = oneBasedIndices[0];
mlir::Value K = oneBasedIndices[1];
hlfir::Entity resultElement =
hlfir::getElementAt(loc, builder, result, {J});
hlfir::Entity resultElementValue =
hlfir::loadTrivialScalar(loc, builder, resultElement);
hlfir::Entity lhsElementValue =
hlfir::loadElementAt(loc, builder, lhs, {J, K});
hlfir::Entity rhsElementValue =
hlfir::loadElementAt(loc, builder, rhs, {K});
mlir::Value productValue =
ProductFactory{loc, builder}.genAccumulateProduct(
resultElementValue, lhsElementValue, rhsElementValue);
hlfir::AssignOp::create(builder, loc, productValue, resultElement);
return {};
};
hlfir::genLoopNestWithReductions(
loc, builder, {resultExtents[0], innerProductExtent},
/*reductionInits=*/{}, genMatrixVector, isUnordered);
return mlir::success();
}
if (lhs.getRank() == 1 && rhs.getRank() == 2) {
// LHS(N) * RHS(N,NCOLS) -> RESULT(NCOLS)
//
// Insert the computation loop nest:
// DO 2 K = 1, N
// DO 2 J = 1, NCOLS
// 2 RES(J) = RES(J) + LHS(K)*RHS(K,J)
auto genVectorMatrix = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 0> {
mlir::Value J = oneBasedIndices[0];
mlir::Value K = oneBasedIndices[1];
hlfir::Entity resultElement =
hlfir::getElementAt(loc, builder, result, {J});
hlfir::Entity resultElementValue =
hlfir::loadTrivialScalar(loc, builder, resultElement);
hlfir::Entity lhsElementValue =
hlfir::loadElementAt(loc, builder, lhs, {K});
hlfir::Entity rhsElementValue =
hlfir::loadElementAt(loc, builder, rhs, {K, J});
mlir::Value productValue =
ProductFactory{loc, builder}.genAccumulateProduct(
resultElementValue, lhsElementValue, rhsElementValue);
hlfir::AssignOp::create(builder, loc, productValue, resultElement);
return {};
};
hlfir::genLoopNestWithReductions(
loc, builder, {resultExtents[0], innerProductExtent},
/*reductionInits=*/{}, genVectorMatrix, isUnordered);
return mlir::success();
}
llvm_unreachable("unsupported MATMUL arguments' ranks");
}
static hlfir::ElementalOp
genElementalMatmul(mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::ExprType resultType, mlir::Value resultShape,
hlfir::Entity lhs, hlfir::Entity rhs,
mlir::Value innerProductExtent) {
mlir::OpBuilder::InsertionGuard guard(builder);
mlir::Type resultElementType = resultType.getElementType();
auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange resultIndices) -> hlfir::Entity {
mlir::Value initValue =
fir::factory::createZeroValue(builder, loc, resultElementType);
// The inner product loop may be unordered if FastMathFlags::reassoc
// transformations are allowed. The integer/logical inner product is
// always unordered.
bool isUnordered = mlir::isa<mlir::IntegerType>(resultElementType) ||
mlir::isa<fir::LogicalType>(resultElementType) ||
static_cast<bool>(builder.getFastMathFlags() &
mlir::arith::FastMathFlags::reassoc);
auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 1> {
llvm::SmallVector<mlir::Value, 2> lhsIndices;
llvm::SmallVector<mlir::Value, 2> rhsIndices;
// MATMUL:
// LHS(NROWS,N) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS)
// LHS(NROWS,N) * RHS(N) -> RESULT(NROWS)
// LHS(N) * RHS(N,NCOLS) -> RESULT(NCOLS)
//
// MATMUL(TRANSPOSE):
// TRANSPOSE(LHS(N,NROWS)) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS)
// TRANSPOSE(LHS(N,NROWS)) * RHS(N) -> RESULT(NROWS)
//
// The resultIndices iterate over (NROWS[,NCOLS]).
// The oneBasedIndices iterate over (N).
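// For example, in the rank-2 MATMUL(TRANSPOSE) case, element (i, j)
// of the result accumulates LHS(k, i) * RHS(k, j) over k.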
if (lhs.getRank() > 1)
lhsIndices.push_back(resultIndices[0]);
lhsIndices.push_back(oneBasedIndices[0]);
if constexpr (isMatmulTranspose) {
// Swap the LHS indices for TRANSPOSE.
std::swap(lhsIndices[0], lhsIndices[1]);
}
rhsIndices.push_back(oneBasedIndices[0]);
if (rhs.getRank() > 1)
rhsIndices.push_back(resultIndices.back());
hlfir::Entity lhsElementValue =
hlfir::loadElementAt(loc, builder, lhs, lhsIndices);
hlfir::Entity rhsElementValue =
hlfir::loadElementAt(loc, builder, rhs, rhsIndices);
mlir::Value productValue =
ProductFactory{loc, builder}.genAccumulateProduct(
reductionArgs[0], lhsElementValue, rhsElementValue);
return {productValue};
};
llvm::SmallVector<mlir::Value, 1> innerProductValue =
hlfir::genLoopNestWithReductions(loc, builder, {innerProductExtent},
{initValue}, genBody, isUnordered);
return hlfir::Entity{innerProductValue[0]};
};
hlfir::ElementalOp elementalOp = hlfir::genElementalOp(
loc, builder, resultElementType, resultShape, /*typeParams=*/{},
genKernel,
/*isUnordered=*/true, /*polymorphicMold=*/nullptr, resultType);
return elementalOp;
}
};
class DotProductConversion
: public mlir::OpRewritePattern<hlfir::DotProductOp> {
public:
using mlir::OpRewritePattern<hlfir::DotProductOp>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(hlfir::DotProductOp product,
mlir::PatternRewriter &rewriter) const override {
hlfir::Entity op = hlfir::Entity{product};
if (!op.isScalar())
return rewriter.notifyMatchFailure(product, "produces non-scalar result");
mlir::Location loc = product.getLoc();
fir::FirOpBuilder builder{rewriter, product.getOperation()};
hlfir::Entity lhs = hlfir::Entity{product.getLhs()};
hlfir::Entity rhs = hlfir::Entity{product.getRhs()};
mlir::Type resultElementType = product.getType();
bool isUnordered = mlir::isa<mlir::IntegerType>(resultElementType) ||
mlir::isa<fir::LogicalType>(resultElementType) ||
static_cast<bool>(builder.getFastMathFlags() &
mlir::arith::FastMathFlags::reassoc);
mlir::Value extent = genProductExtent(loc, builder, lhs, rhs);
auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange oneBasedIndices,
mlir::ValueRange reductionArgs)
-> llvm::SmallVector<mlir::Value, 1> {
hlfir::Entity lhsElementValue =
hlfir::loadElementAt(loc, builder, lhs, oneBasedIndices);
hlfir::Entity rhsElementValue =
hlfir::loadElementAt(loc, builder, rhs, oneBasedIndices);
mlir::Value productValue =
ProductFactory{loc, builder}.genAccumulateProduct</*CONJ=*/true>(
reductionArgs[0], lhsElementValue, rhsElementValue);
return {productValue};
};
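// Note: genAccumulateProduct with CONJ=true above follows the Fortran
// DOT_PRODUCT semantics, where the first argument is conjugated for
// complex vectors; e.g. DOT_PRODUCT([(0,1)], [(0,1)]) evaluates
// CONJG((0,1)) * (0,1) = (1,0).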
mlir::Value initValue =
fir::factory::createZeroValue(builder, loc, resultElementType);
llvm::SmallVector<mlir::Value, 1> result = hlfir::genLoopNestWithReductions(
loc, builder, {extent},
/*reductionInits=*/{initValue}, genBody, isUnordered);
rewriter.replaceOp(product, result[0]);
return mlir::success();
}
private:
static mlir::Value genProductExtent(mlir::Location loc,
fir::FirOpBuilder &builder,
hlfir::Entity input1,
hlfir::Entity input2) {
llvm::SmallVector<mlir::Value, 1> input1Extents =
hlfir::genExtentsVector(loc, builder, input1);
llvm::SmallVector<mlir::Value, 1> input2Extents =
hlfir::genExtentsVector(loc, builder, input2);
assert(input1Extents.size() == 1 && input2Extents.size() == 1 &&
"hlfir.dot_product arguments must be vectors");
llvm::SmallVector<mlir::Value, 1> extent =
fir::factory::deduceOptimalExtents(input1Extents, input2Extents);
return extent[0];
}
};
class ReshapeAsElementalConversion
: public mlir::OpRewritePattern<hlfir::ReshapeOp> {
public:
using mlir::OpRewritePattern<hlfir::ReshapeOp>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(hlfir::ReshapeOp reshape,
mlir::PatternRewriter &rewriter) const override {
// Do not inline RESHAPE with ORDER yet. The runtime implementation
// may be good enough, unless the temporary creation overhead
// is high.
// TODO: If ORDER is constant, then we can still easily inline.
// TODO: If the result's rank is 1, then we can assume ORDER == (/1/).
if (reshape.getOrder())
return rewriter.notifyMatchFailure(reshape,
"RESHAPE with ORDER argument");
// Verify that the element types of ARRAY, PAD and the result
// match before doing any transformations. For example,
// character types of different lengths may appear in dead
// code, and it just does not make sense to inline hlfir.reshape
// in this case (a runtime call might have a smaller code size
// footprint).
hlfir::Entity result = hlfir::Entity{reshape};
hlfir::Entity array = hlfir::Entity{reshape.getArray()};
mlir::Type elementType = array.getFortranElementType();
if (result.getFortranElementType() != elementType)
return rewriter.notifyMatchFailure(
reshape, "ARRAY and result have different types");
mlir::Value pad = reshape.getPad();
if (pad && hlfir::getFortranElementType(pad.getType()) != elementType)
return rewriter.notifyMatchFailure(reshape,
"ARRAY and PAD have different types");
// TODO: selecting between ARRAY and PAD of non-trivial element types
// requires more work. We have to select between two references
// to elements in ARRAY and PAD. This requires conditional
// bufferization of the element, if ARRAY/PAD is an expression.
if (pad && !fir::isa_trivial(elementType))
return rewriter.notifyMatchFailure(reshape,
"PAD present with non-trivial type");
mlir::Location loc = reshape.getLoc();
fir::FirOpBuilder builder{rewriter, reshape.getOperation()};
// Assume that all the indices arithmetic does not overflow
// the IndexType.
builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nuw);
llvm::SmallVector<mlir::Value, 1> typeParams;
hlfir::genLengthParameters(loc, builder, array, typeParams);
// Fetch the extents of ARRAY, PAD and result beforehand.
llvm::SmallVector<mlir::Value, Fortran::common::maxRank> arrayExtents =
hlfir::genExtentsVector(loc, builder, array);
// If PAD is present, we have to use the ARRAY size to know where
// to start taking elements from the PAD array.
mlir::Value arraySize =
pad ? computeArraySize(loc, builder, arrayExtents) : nullptr;
hlfir::Entity shape = hlfir::Entity{reshape.getShape()};
llvm::SmallVector<mlir::Value, Fortran::common::maxRank> resultExtents;
mlir::Type indexType = builder.getIndexType();
for (int idx = 0; idx < result.getRank(); ++idx)
resultExtents.push_back(hlfir::loadElementAt(
loc, builder, shape,
builder.createIntegerConstant(loc, indexType, idx + 1)));
auto resultShape = fir::ShapeOp::create(builder, loc, resultExtents);
auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange inputIndices) -> hlfir::Entity {
mlir::Value linearIndex =
computeLinearIndex(loc, builder, resultExtents, inputIndices);
fir::IfOp ifOp;
if (pad) {
// PAD is present. Check if this element comes from the PAD array.
mlir::Value isInsideArray = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::ult, linearIndex,
arraySize);
ifOp = fir::IfOp::create(builder, loc, elementType, isInsideArray,
/*withElseRegion=*/true);
// In the 'else' block, return an element from the PAD.
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
// PAD is dynamically optional, but we can unconditionally access it
// in the 'else' block. If we have to start taking elements from it,
// then it must be present in a valid program.
llvm::SmallVector<mlir::Value, Fortran::common::maxRank> padExtents =
hlfir::genExtentsVector(loc, builder, hlfir::Entity{pad});
// Subtract the ARRAY size from the zero-based linear index
// to get the zero-based linear index into PAD.
mlir::Value padLinearIndex =
mlir::arith::SubIOp::create(builder, loc, linearIndex, arraySize);
llvm::SmallVector<mlir::Value, Fortran::common::maxRank> padIndices =
delinearizeIndex(loc, builder, padExtents, padLinearIndex,
/*wrapAround=*/true);
mlir::Value padElement =
hlfir::loadElementAt(loc, builder, hlfir::Entity{pad}, padIndices);
fir::ResultOp::create(builder, loc, padElement);
// In the 'then' block, return an element from the ARRAY.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
}
llvm::SmallVector<mlir::Value, Fortran::common::maxRank> arrayIndices =
delinearizeIndex(loc, builder, arrayExtents, linearIndex,
/*wrapAround=*/false);
mlir::Value arrayElement =
hlfir::loadElementAt(loc, builder, array, arrayIndices);
if (ifOp) {
fir::ResultOp::create(builder, loc, arrayElement);
builder.setInsertionPointAfter(ifOp);
arrayElement = ifOp.getResult(0);
}
return hlfir::Entity{arrayElement};
};
hlfir::ElementalOp elementalOp = hlfir::genElementalOp(
loc, builder, elementType, resultShape, typeParams, genKernel,
/*isUnordered=*/true,
/*polymorphicMold=*/result.isPolymorphic() ? array : mlir::Value{},
reshape.getResult().getType());
assert(elementalOp.getResult().getType() == reshape.getResult().getType());
rewriter.replaceOp(reshape, elementalOp);
return mlir::success();
}
private:
/// Compute zero-based linear index given an array extents
/// and one-based indices:
/// \p extents: [e0, e1, ..., en]
/// \p indices: [i0, i1, ..., in]
///
/// linear-index :=
/// (...((in-1)*e(n-1)+(i(n-1)-1))*e(n-2)+...)*e0+(i0-1)
static mlir::Value computeLinearIndex(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::ValueRange extents,
mlir::ValueRange indices) {
std::size_t rank = extents.size();
assert(rank == indices.size());
mlir::Type indexType = builder.getIndexType();
mlir::Value zero = builder.createIntegerConstant(loc, indexType, 0);
mlir::Value one = builder.createIntegerConstant(loc, indexType, 1);
mlir::Value linearIndex = zero;
std::size_t idx = 0;
for (auto index : llvm::reverse(indices)) {
mlir::Value tmp = mlir::arith::SubIOp::create(
builder, loc, builder.createConvert(loc, indexType, index), one);
tmp = mlir::arith::AddIOp::create(builder, loc, linearIndex, tmp);
if (idx + 1 < rank)
tmp = mlir::arith::MulIOp::create(
builder, loc, tmp,
builder.createConvert(loc, indexType, extents[rank - idx - 2]));
linearIndex = tmp;
++idx;
}
return linearIndex;
}
/// Compute one-based array indices from the given zero-based \p linearIndex
/// and the array \p extents [e0, e1, ..., en].
/// i0 := linearIndex % e0 + 1
/// linearIndex := linearIndex / e0
/// i1 := linearIndex % e1 + 1
/// linearIndex := linearIndex / e1
/// ...
/// i(n-1) := linearIndex % e(n-1) + 1
/// linearIndex := linearIndex / e(n-1)
/// if (wrapAround) {
/// // If the index is allowed to wrap around, then
/// // we need to modulo it by the last dimension's extent.
/// in := linearIndex % en + 1
/// } else {
/// in := linearIndex + 1
/// }
static llvm::SmallVector<mlir::Value, Fortran::common::maxRank>
delinearizeIndex(mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange extents, mlir::Value linearIndex,
bool wrapAround) {
llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices;
mlir::Type indexType = builder.getIndexType();
mlir::Value one = builder.createIntegerConstant(loc, indexType, 1);
linearIndex = builder.createConvert(loc, indexType, linearIndex);
for (std::size_t dim = 0; dim < extents.size(); ++dim) {
mlir::Value extent = builder.createConvert(loc, indexType, extents[dim]);
// Avoid the modulo for the last index, unless wrap around is allowed.
mlir::Value currentIndex = linearIndex;
if (dim != extents.size() - 1 || wrapAround)
currentIndex =
mlir::arith::RemUIOp::create(builder, loc, linearIndex, extent);
// The result of the last division is unused, so it will be DCEd.
linearIndex =
mlir::arith::DivUIOp::create(builder, loc, linearIndex, extent);
indices.push_back(
mlir::arith::AddIOp::create(builder, loc, currentIndex, one));
}
return indices;
}
/// Return size of an array given its extents.
static mlir::Value computeArraySize(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::ValueRange extents) {
mlir::Type indexType = builder.getIndexType();
mlir::Value size = builder.createIntegerConstant(loc, indexType, 1);
for (auto extent : extents)
size = mlir::arith::MulIOp::create(
builder, loc, size, builder.createConvert(loc, indexType, extent));
return size;
}
};
class SimplifyHLFIRIntrinsics
: public hlfir::impl::SimplifyHLFIRIntrinsicsBase<SimplifyHLFIRIntrinsics> {
public:
using SimplifyHLFIRIntrinsicsBase<
SimplifyHLFIRIntrinsics>::SimplifyHLFIRIntrinsicsBase;
void runOnOperation() override {
mlir::MLIRContext *context = &getContext();
mlir::GreedyRewriteConfig config;
// Prevent the pattern driver from merging blocks
config.setRegionSimplificationLevel(
mlir::GreedySimplifyRegionLevel::Disabled);
mlir::RewritePatternSet patterns(context);
patterns.insert<TransposeAsElementalConversion>(context);
patterns.insert<ReductionConversion<hlfir::SumOp>>(context);
patterns.insert<ReductionConversion<hlfir::ProductOp>>(context);
patterns.insert<ArrayShiftConversion<hlfir::CShiftOp>>(context);
patterns.insert<ArrayShiftConversion<hlfir::EOShiftOp>>(context);
patterns.insert<CmpCharOpConversion>(context);
patterns.insert<IndexOpConversion>(context);
patterns.insert<MatmulConversion<hlfir::MatmulTransposeOp>>(context);
patterns.insert<ReductionConversion<hlfir::CountOp>>(context);
patterns.insert<ReductionConversion<hlfir::AnyOp>>(context);
patterns.insert<ReductionConversion<hlfir::AllOp>>(context);
patterns.insert<ReductionConversion<hlfir::MaxlocOp>>(context);
patterns.insert<ReductionConversion<hlfir::MinlocOp>>(context);
patterns.insert<ReductionConversion<hlfir::MaxvalOp>>(context);
patterns.insert<ReductionConversion<hlfir::MinvalOp>>(context);
// If forceMatmulAsElemental is false, then hlfir.matmul inlining
// will introduce hlfir.eval_in_mem operation with new memory side
// effects. This conflicts with CSE and optimized bufferization, e.g.:
// A(1:N,1:N) = A(1:N,1:N) - MATMUL(...)
// If we introduce hlfir.eval_in_mem before CSE, then the current
// MLIR CSE won't be able to optimize the trivial loads of 'N' value
// that happen before and after hlfir.matmul.
// If 'N' loads are not optimized, then the optimized bufferization
// won't be able to prove that the slices of A are identical
// on both sides of the assignment.
// This is actually the CSE problem, but we can work it around
// for the time being.
if (forceMatmulAsElemental || this->allowNewSideEffects)
patterns.insert<MatmulConversion<hlfir::MatmulOp>>(context);
patterns.insert<DotProductConversion>(context);
patterns.insert<ReshapeAsElementalConversion>(context);
if (mlir::failed(mlir::applyPatternsGreedily(
getOperation(), std::move(patterns), config))) {
mlir::emitError(getOperation()->getLoc(),
"failure in HLFIR intrinsic simplification");
signalPassFailure();
}
}
};
} // namespace