blob: 890f8a6c8339d90370b754f063984499b1446afb [file] [log] [blame] [edit]
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Helpers to emit OpenACC clause recipes as CIR code.
//
//===----------------------------------------------------------------------===//
#include <numeric>
#include "CIRGenOpenACCRecipe.h"
namespace clang::CIRGen {
mlir::Block *OpenACCRecipeBuilderBase::createRecipeBlock(mlir::Region &region,
                                                         mlir::Type opTy,
                                                         mlir::Location loc,
                                                         size_t numBounds,
                                                         bool isInit) {
  // Build the argument signature for a recipe region's entry block: the
  // operation type (once for 'init', twice for every other section), followed
  // by one acc.DataBounds value per bound.
  llvm::SmallVector<mlir::Type> argTypes;
  argTypes.reserve(numBounds + 2);
  argTypes.push_back(opTy);
  // The init section is the only one that doesn't have TWO copies of the
  // operation-type. Copy has a to/from, and destroy has a
  // 'reference'/'privatized' copy version.
  if (!isInit)
    argTypes.push_back(opTy);
  argTypes.append(numBounds,
                  mlir::acc::DataBoundsType::get(&cgf.getMLIRContext()));
  // Every block argument shares the same source location.
  llvm::SmallVector<mlir::Location> argLocs{argTypes.size(), loc};
  return builder.createBlock(&region, region.end(), argTypes, argLocs);
}
// Emits a loop that populates 'destAlloca' so that each of its elements holds
// a pointer into 'srcAlloca': destAlloca[itr] = &srcAlloca[offsetPerSubarray *
// itr]. This stitches together the per-dimension allocations produced by
// makeBoundsAlloca. If 'numEltsToCopy' is null, the loop runs exactly once.
// The insertion point is restored on exit (InsertionGuard).
void OpenACCRecipeBuilderBase::makeAllocaCopy(mlir::Location loc,
                                              mlir::Type copyType,
                                              mlir::Value numEltsToCopy,
                                              mlir::Value offsetPerSubarray,
                                              mlir::Value destAlloca,
                                              mlir::Value srcAlloca) {
  mlir::OpBuilder::InsertionGuard guardCase(builder);
  // The loop iterator is an 'unsigned long long', with target-correct
  // alignment taken from the AST context.
  mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy);
  auto itrPtrTy = cir::PointerType::get(itrTy);
  mlir::IntegerAttr itrAlign =
      cgf.cgm.getSize(cgf.getContext().getTypeAlignInChars(
          cgf.getContext().UnsignedLongLongTy));

  auto loopBuilder = [&]() {
    // Alloca + zero-initialize the iterator.
    auto itr =
        cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "itr", itrAlign);
    cir::ConstantOp constZero = builder.getConstInt(loc, itrTy, 0);
    builder.CIRBaseBuilderTy::createStore(loc, constZero, itr);
    builder.createFor(
        loc,
        /*condBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          // itr < numEltsToCopy
          // Enforce a trip count of 1 if there wasn't any element count, this
          // way we can just use this loop with a constant bounds instead of a
          // separate code path.
          if (!numEltsToCopy)
            numEltsToCopy = builder.getConstInt(loc, itrTy, 1);
          auto loadCur = cir::LoadOp::create(builder, loc, {itr});
          auto cmp = builder.createCompare(loc, cir::CmpOpKind::lt, loadCur,
                                           numEltsToCopy);
          builder.createCondition(cmp);
        },
        /*bodyBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          // destAlloca[itr] = srcAlloca[offsetPerSubArray * itr];
          auto loadCur = cir::LoadOp::create(builder, loc, {itr});
          auto srcOffset = builder.createMul(loc, offsetPerSubarray, loadCur);
          // Pointer to the start of this iteration's subarray in the source.
          auto ptrToOffsetIntoSrc = cir::PtrStrideOp::create(
              builder, loc, copyType, srcAlloca, srcOffset);
          // Slot in the destination where that pointer is stored.
          auto offsetIntoDecayDest = cir::PtrStrideOp::create(
              builder, loc, builder.getPointerTo(copyType), destAlloca,
              loadCur);
          builder.CIRBaseBuilderTy::createStore(loc, ptrToOffsetIntoSrc,
                                                offsetIntoDecayDest);
          builder.createYield(loc);
        },
        /*stepBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          // Simple increment of the iterator.
          auto load = cir::LoadOp::create(builder, loc, {itr});
          auto inc = cir::UnaryOp::create(builder, loc, load.getType(),
                                          cir::UnaryOpKind::Inc, load);
          builder.CIRBaseBuilderTy::createStore(loc, inc, itr);
          builder.createYield(loc);
        });
  };
  // Wrap the whole loop (iterator alloca included) in its own scope.
  cir::ScopeOp::create(builder, loc,
                       [&](mlir::OpBuilder &b, mlir::Location loc) {
                         loopBuilder();
                         builder.createYield(loc);
                       });
}
// Emits the storage for a bounds-qualified recipe variable: one alloca for the
// 'top level' type, plus (walking the bounds from outermost to innermost) a
// sized alloca per pointer level, each linked into its parent via
// makeAllocaCopy. Constant-array levels need no new allocation and are simply
// decayed. Returns the top-level alloca. The insertion point is restored on
// exit (InsertionGuard).
mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca(
    mlir::Block *block, SourceRange exprRange, mlir::Location loc,
    std::string_view allocaName, size_t numBounds,
    llvm::ArrayRef<QualType> boundTypes) {
  mlir::OpBuilder::InsertionGuard guardCase(builder);

  // Get the range of bounds arguments, which are all but the 1st arg.
  llvm::ArrayRef<mlir::BlockArgument> boundsRange =
      block->getArguments().drop_front(1);

  // boundTypes contains the before and after of each bounds, so it ends up
  // having 1 extra. Assert this is the case to ensure we don't call this in the
  // wrong 'block'.
  assert(boundsRange.size() + 1 == boundTypes.size());

  mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy);
  auto idxType = mlir::IndexType::get(&cgf.getMLIRContext());

  // Extracts the upper bound of an acc bounds value and casts it to the
  // iterator type.
  auto getUpperBound = [&](mlir::Value bound) {
    auto upperBoundVal =
        mlir::acc::GetUpperboundOp::create(builder, loc, idxType, bound);
    return mlir::UnrealizedConversionCastOp::create(builder, loc, itrTy,
                                                    upperBoundVal.getResult())
        .getResult(0);
  };

  // True iff 'ty' is a constant array; VLAs are diagnosed as NYI.
  auto isArrayTy = [&](QualType ty) {
    if (ty->isArrayType() && !ty->isConstantArrayType())
      cgf.cgm.errorNYI(exprRange, "OpenACC recipe init for VLAs");
    return ty->isConstantArrayType();
  };

  mlir::Type topLevelTy = cgf.convertType(boundTypes.back());
  cir::PointerType topLevelTyPtr = builder.getPointerTo(topLevelTy);
  // Do an alloca for the 'top' level type without bounds.
  mlir::Value initialAlloca = builder.createAlloca(
      loc, topLevelTyPtr, topLevelTy, allocaName,
      cgf.getContext().getTypeAlignInChars(boundTypes.back()));

  bool lastBoundWasArray = isArrayTy(boundTypes.back());

  // Make sure we track a moving version of this so we can get our
  // 'copying' back to correct.
  mlir::Value lastAlloca = initialAlloca;

  // Since we're iterating the types in reverse, this sets up for each index
  // corresponding to the boundsRange to be the 'after application of the
  // bounds' type.
  llvm::ArrayRef<QualType> boundResults = boundTypes.drop_back(1);

  // Collect the 'do we have any allocas needed after this type' list: an
  // inclusive scan so each entry tells whether any non-constant-array type
  // remains at or before that level.
  llvm::SmallVector<bool> allocasLeftArr;
  llvm::ArrayRef<QualType> resultTypes = boundTypes.drop_front();
  std::transform_inclusive_scan(
      resultTypes.begin(), resultTypes.end(),
      std::back_inserter(allocasLeftArr), std::plus<bool>{},
      [](QualType ty) { return !ty->isConstantArrayType(); }, false);

  // Keep track of the number of 'elements' that we're allocating. Individual
  // allocas should multiply this by the size of its current allocation.
  mlir::Value cumulativeElts;
  for (auto [bound, resultType, allocasLeft] : llvm::reverse(
           llvm::zip_equal(boundsRange, boundResults, allocasLeftArr))) {

    // if there is no further 'alloca' operation we need to do, we can skip
    // creating the UB/multiplications/etc.
    if (!allocasLeft)
      break;

    // First: figure out the number of elements in the current 'bound' list.
    mlir::Value eltsPerSubArray = getUpperBound(bound);
    mlir::Value eltsToAlloca;

    // IF we are in a sub-bounds, the total number of elements to alloca is
    // the product of that one and the current 'bounds' size. That is,
    // arr[5][5], we would need 25 elements, not just 5. Else it is just the
    // current number of elements.
    if (cumulativeElts)
      eltsToAlloca = builder.createMul(loc, eltsPerSubArray, cumulativeElts);
    else
      eltsToAlloca = eltsPerSubArray;

    if (!lastBoundWasArray) {
      // If we have to do an allocation, figure out the size of the
      // allocation. alloca takes the number of bytes, not elements.
      TypeInfoChars eltInfo = cgf.getContext().getTypeInfoInChars(resultType);
      cir::ConstantOp eltSize = builder.getConstInt(
          loc, itrTy, eltInfo.Width.alignTo(eltInfo.Align).getQuantity());
      mlir::Value curSize = builder.createMul(loc, eltsToAlloca, eltSize);

      mlir::Type eltTy = cgf.convertType(resultType);
      cir::PointerType ptrTy = builder.getPointerTo(eltTy);
      mlir::Value curAlloca = builder.createAlloca(
          loc, ptrTy, eltTy, "openacc.init.bounds",
          cgf.getContext().getTypeAlignInChars(resultType), curSize);

      // Wire the previous level's pointers to point into this allocation.
      makeAllocaCopy(loc, ptrTy, cumulativeElts, eltsPerSubArray, lastAlloca,
                     curAlloca);
      lastAlloca = curAlloca;
    } else {
      // In the case of an array, we just need to decay the pointer, so just do
      // a zero-offset stride on the last alloca to decay it down an array
      // level.
      cir::ConstantOp constZero = builder.getConstInt(loc, itrTy, 0);
      lastAlloca = builder.getArrayElement(loc, loc, lastAlloca,
                                           cgf.convertType(resultType),
                                           constZero, /*shouldDecay=*/true);
    }

    cumulativeElts = eltsToAlloca;
    lastBoundWasArray = isArrayTy(resultType);
  }
  return initialAlloca;
}
// Emits a scoped loop over a single acc bounds value (lowerbound inclusive,
// upperbound exclusive; iterated in reverse when 'inverse' is set) and
// subscripts each non-null input value by the loop iterator. Returns the
// per-iteration subscripted values. Note: this function is not 'insertion
// point' clean -- on return the insertion point is inside the loop body (just
// before its yield) so callers can emit per-element code there.
std::pair<mlir::Value, mlir::Value> OpenACCRecipeBuilderBase::createBoundsLoop(
    mlir::Value subscriptedValue, mlir::Value subscriptedValue2,
    mlir::Value bound, mlir::Location loc, bool inverse) {
  mlir::Operation *bodyInsertLoc;

  mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy);
  auto itrPtrTy = cir::PointerType::get(itrTy);
  mlir::IntegerAttr itrAlign =
      cgf.cgm.getSize(cgf.getContext().getTypeAlignInChars(
          cgf.getContext().UnsignedLongLongTy));
  auto idxType = mlir::IndexType::get(&cgf.getMLIRContext());

  // Subscript one level into 'subVal': constant arrays decay+index in place;
  // otherwise the value must be a pointer, which is loaded and strided.
  auto doSubscriptOp = [&](mlir::Value subVal,
                           cir::LoadOp idxLoad) -> mlir::Value {
    auto eltTy = cast<cir::PointerType>(subVal.getType()).getPointee();

    if (auto arrayTy = dyn_cast<cir::ArrayType>(eltTy))
      return builder.getArrayElement(loc, loc, subVal, arrayTy.getElementType(),
                                     idxLoad,
                                     /*shouldDecay=*/true);

    assert(isa<cir::PointerType>(eltTy));

    auto eltLoad = cir::LoadOp::create(builder, loc, {subVal});

    return cir::PtrStrideOp::create(builder, loc, eltLoad.getType(), eltLoad,
                                    idxLoad);
  };

  auto forStmtBuilder = [&]() {
    // get the lower and upper bound for iterating over.
    auto lowerBoundVal =
        mlir::acc::GetLowerboundOp::create(builder, loc, idxType, bound);
    auto lbConversion = mlir::UnrealizedConversionCastOp::create(
        builder, loc, itrTy, lowerBoundVal.getResult());
    auto upperBoundVal =
        mlir::acc::GetUpperboundOp::create(builder, loc, idxType, bound);
    auto ubConversion = mlir::UnrealizedConversionCastOp::create(
        builder, loc, itrTy, upperBoundVal.getResult());

    // Create a memory location for the iterator.
    auto itr =
        cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "iter", itrAlign);
    // Store to the iterator: either lower bound, or if inverse loop, upper
    // bound.
    if (inverse) {
      cir::ConstantOp constOne = builder.getConstInt(loc, itrTy, 1);

      auto sub = cir::BinOp::create(builder, loc, itrTy, cir::BinOpKind::Sub,
                                    ubConversion.getResult(0), constOne);

      // Upperbound is exclusive, so subtract 1.
      builder.CIRBaseBuilderTy::createStore(loc, sub, itr);
    } else {
      // Lowerbound is inclusive, so we can include it.
      builder.CIRBaseBuilderTy::createStore(loc, lbConversion.getResult(0),
                                            itr);
    }

    // Save the 'end' iterator based on whether we are inverted or not. This
    // end iterator never changes, so we can just get it and convert it, so no
    // need to store/load/etc.
    auto endItr = inverse ? lbConversion : ubConversion;

    builder.createFor(
        loc,
        /*condBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          auto loadCur = cir::LoadOp::create(builder, loc, {itr});
          // Inverse loops count down and stop once we pass the (inclusive)
          // lower bound, hence 'ge'; forward loops stop at the exclusive
          // upper bound, hence 'lt'.
          auto cmp = builder.createCompare(
              loc, inverse ? cir::CmpOpKind::ge : cir::CmpOpKind::lt, loadCur,
              endItr.getResult(0));
          builder.createCondition(cmp);
        },
        /*bodyBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          auto load = cir::LoadOp::create(builder, loc, {itr});

          if (subscriptedValue)
            subscriptedValue = doSubscriptOp(subscriptedValue, load);
          if (subscriptedValue2)
            subscriptedValue2 = doSubscriptOp(subscriptedValue2, load);

          // Remember the body's yield so we can position the insertion point
          // before it after the loop structure is built.
          bodyInsertLoc = builder.createYield(loc);
        },
        /*stepBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          auto load = cir::LoadOp::create(builder, loc, {itr});
          auto unary = cir::UnaryOp::create(
              builder, loc, load.getType(),
              inverse ? cir::UnaryOpKind::Dec : cir::UnaryOpKind::Inc, load);
          builder.CIRBaseBuilderTy::createStore(loc, unary, itr);
          builder.createYield(loc);
        });
  };

  cir::ScopeOp::create(builder, loc,
                       [&](mlir::OpBuilder &b, mlir::Location loc) {
                         forStmtBuilder();
                         builder.createYield(loc);
                       });

  // Leave the insertion point to be inside the body, so we can loop over
  // these things.
  builder.setInsertionPoint(bodyInsertLoc);
  return {subscriptedValue, subscriptedValue2};
}
// Translates a clang OpenACC reduction operator into the equivalent
// acc-dialect reduction operator. 'Invalid' must never reach here.
mlir::acc::ReductionOperator
OpenACCRecipeBuilderBase::convertReductionOp(OpenACCReductionOperator op) {
  using AccOp = mlir::acc::ReductionOperator;
  switch (op) {
  case OpenACCReductionOperator::Addition:
    return AccOp::AccAdd;
  case OpenACCReductionOperator::Multiplication:
    return AccOp::AccMul;
  case OpenACCReductionOperator::Max:
    return AccOp::AccMax;
  case OpenACCReductionOperator::Min:
    return AccOp::AccMin;
  case OpenACCReductionOperator::BitwiseAnd:
    return AccOp::AccIand;
  case OpenACCReductionOperator::BitwiseOr:
    return AccOp::AccIor;
  case OpenACCReductionOperator::BitwiseXOr:
    return AccOp::AccXor;
  case OpenACCReductionOperator::And:
    return AccOp::AccLand;
  case OpenACCReductionOperator::Or:
    return AccOp::AccLor;
  case OpenACCReductionOperator::Invalid:
    llvm_unreachable("invalid reduction operator");
  }
  llvm_unreachable("invalid reduction operator");
}
// This function generates the 'destroy' section for a recipe. Note
// that this function is not 'insertion point' clean, in that it alters the
// insertion point to be inside of the 'destroy' section of the recipe, but
// doesn't restore it afterwards.
void OpenACCRecipeBuilderBase::createRecipeDestroySection(
    mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp,
    CharUnits alignment, QualType origType, size_t numBounds, QualType baseType,
    mlir::Region &destroyRegion) {
  mlir::Block *block = createRecipeBlock(destroyRegion, mainOp.getType(), loc,
                                         numBounds, /*isInit=*/false);
  builder.setInsertionPointToEnd(&destroyRegion.back());
  CIRGenFunction::LexicalScope ls(cgf, loc, block);

  mlir::Type elementTy =
      mlir::cast<cir::PointerType>(mainOp.getType()).getPointee();

  // Runs the C++ destructor for 'origType' on the given address.
  auto emitDestroy = [&](mlir::Value var, mlir::Type ty) {
    Address addr{var, ty, alignment};
    cgf.emitDestroy(addr, origType,
                    cgf.getDestroyer(QualType::DK_cxx_destructor));
  };

  if (numBounds) {
    mlir::OpBuilder::InsertionGuard guardCase(builder);
    // Get the range of bounds arguments, which are all but the 1st 2. 1st is
    // a 'reference', 2nd is the 'private' variant we need to destroy from.
    llvm::MutableArrayRef<mlir::BlockArgument> boundsRange =
        block->getArguments().drop_front(2);

    mlir::Value subscriptedValue = block->getArgument(1);
    // Inverse loops: elements are destroyed in reverse construction order.
    for (mlir::BlockArgument boundArg : llvm::reverse(boundsRange))
      subscriptedValue = createBoundsLoop(subscriptedValue, boundArg, loc,
                                          /*inverse=*/true);

    emitDestroy(subscriptedValue, cgf.cgm.convertType(origType));
  } else {
    // If we don't have any bounds, we can just destroy the variable directly.
    // The destroy region has a signature of "original item, privatized item".
    // So the 2nd item is the one that needs destroying, the former is just
    // for reference and we don't really have a need for it at the moment.
    emitDestroy(block->getArgument(1), elementTy);
  }

  ls.forceCleanup();
  mlir::acc::YieldOp::create(builder, locEnd);
}
// Emits the initialization of a bounds-qualified recipe variable: loops over
// every bounds argument of 'block' (via createBoundsLoop, so the insertion
// point ends up inside the innermost loop body), then runs the variable's
// normal auto-var initialization on the per-element address. The insertion
// point is restored on exit (InsertionGuard).
void OpenACCRecipeBuilderBase::makeBoundsInit(
    mlir::Value alloca, mlir::Location loc, mlir::Block *block,
    const VarDecl *allocaDecl, QualType origType, bool isInitSection) {
  mlir::OpBuilder::InsertionGuard guardCase(builder);
  builder.setInsertionPointToEnd(block);
  CIRGenFunction::LexicalScope ls(cgf, loc, block);

  CIRGenFunction::AutoVarEmission tempDeclEmission{*allocaDecl};
  // The storage was already created (offloaded), so emitAutoVarInit must not
  // emit its own alloca.
  tempDeclEmission.emittedAsOffload = true;

  // The init section is the only one of the handful that only has a single
  // argument for the 'type', so we have to drop 1 for init, and future calls
  // to this will need to drop 2.
  llvm::MutableArrayRef<mlir::BlockArgument> boundsRange =
      block->getArguments().drop_front(isInitSection ? 1 : 2);

  mlir::Value subscriptedValue = alloca;
  for (mlir::BlockArgument boundArg : llvm::reverse(boundsRange))
    subscriptedValue = createBoundsLoop(subscriptedValue, boundArg, loc,
                                        /*inverse=*/false);

  tempDeclEmission.setAllocatedAddress(
      Address{subscriptedValue, cgf.convertType(origType),
              cgf.getContext().getDeclAlign(allocaDecl)});
  cgf.emitAutoVarInit(tempDeclEmission);
}
// TODO: OpenACC: when we start doing firstprivate for array/vlas/etc, we
// probably need to do a little work about the 'init' calls to put it in 'copy'
// region instead.
//
// Generates the 'init' section of a recipe: creates the entry block, emits the
// storage for the recipe variable (directly for the no-bounds case, via
// makeBoundsAlloca otherwise), and runs the variable's initializer when
// requested. Leaves the insertion point inside the init region.
void OpenACCRecipeBuilderBase::createInitRecipe(
    mlir::Location loc, mlir::Location locEnd, SourceRange exprRange,
    mlir::Value mainOp, mlir::Region &recipeInitRegion, size_t numBounds,
    llvm::ArrayRef<QualType> boundTypes, const VarDecl *allocaDecl,
    QualType origType, bool emitInitExpr) {
  assert(allocaDecl && "Required recipe variable not set?");
  // Undo the decl-map entry emitAutoVarAlloca makes when this returns.
  CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, allocaDecl};

  mlir::Block *block = createRecipeBlock(recipeInitRegion, mainOp.getType(),
                                         loc, numBounds, /*isInit=*/true);
  builder.setInsertionPointToEnd(&recipeInitRegion.back());
  CIRGenFunction::LexicalScope ls(cgf, loc, block);

  const Type *allocaPointeeType =
      allocaDecl->getType()->getPointeeOrArrayElementType();
  // We are OK with no init for builtins, arrays of builtins, or pointers,
  // else we should NYI so we know to go look for these.
  if (cgf.getContext().getLangOpts().CPlusPlus && !allocaDecl->getInit() &&
      !allocaDecl->getType()->isPointerType() &&
      !allocaPointeeType->isBuiltinType() &&
      !allocaPointeeType->isPointerType()) {
    // If we don't have any initialization recipe, we failed during Sema to
    // initialize this correctly. If we disable the
    // Sema::TentativeAnalysisScopes in SemaOpenACC::CreateInitRecipe, it'll
    // emit an error to tell us. However, emitting those errors during
    // production is a violation of the standard, so we cannot do them.
    cgf.cgm.errorNYI(exprRange, "private/reduction default-init recipe");
  }

  if (!numBounds) {
    // This is an 'easy' case, we just have to use the builtin init stuff to
    // initialize this variable correctly.
    CIRGenFunction::AutoVarEmission tempDeclEmission =
        cgf.emitAutoVarAlloca(*allocaDecl, builder.saveInsertionPoint());
    if (emitInitExpr)
      cgf.emitAutoVarInit(tempDeclEmission);
  } else {
    mlir::Value alloca = makeBoundsAlloca(
        block, exprRange, loc, allocaDecl->getName(), numBounds, boundTypes);

    // If the initializer is trivial, there is nothing to do here, so save
    // ourselves some effort.
    if (emitInitExpr && allocaDecl->getInit() &&
        (!cgf.isTrivialInitializer(allocaDecl->getInit()) ||
         cgf.getContext().getLangOpts().getTrivialAutoVarInit() !=
             LangOptions::TrivialAutoVarInitKind::Uninitialized))
      makeBoundsInit(alloca, loc, block, allocaDecl, origType,
                     /*isInitSection=*/true);
  }

  ls.forceCleanup();
  mlir::acc::YieldOp::create(builder, locEnd);
}
// Generates the 'copy' section of a firstprivate recipe. The copy region's
// block takes "from, to" (plus bounds); bounds are applied first so the copy
// happens per element, then the copy itself is emitted as the private
// variable's initializer with the temporary mapped to the 'from' address.
// Leaves the insertion point inside the copy region.
void OpenACCRecipeBuilderBase::createFirstprivateRecipeCopy(
    mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp,
    const VarDecl *allocaDecl, const VarDecl *temporary,
    mlir::Region &copyRegion, size_t numBounds) {
  mlir::Block *block = createRecipeBlock(copyRegion, mainOp.getType(), loc,
                                         numBounds, /*isInit=*/false);
  builder.setInsertionPointToEnd(&copyRegion.back());
  CIRGenFunction::LexicalScope ls(cgf, loc, block);

  mlir::Value fromArg = block->getArgument(0);
  mlir::Value toArg = block->getArgument(1);

  llvm::MutableArrayRef<mlir::BlockArgument> boundsRange =
      block->getArguments().drop_front(2);

  // Apply each bound (outermost first); after this, fromArg/toArg address a
  // single element and the builder sits inside the innermost loop body.
  for (mlir::BlockArgument boundArg : llvm::reverse(boundsRange))
    std::tie(fromArg, toArg) =
        createBoundsLoop(fromArg, toArg, boundArg, loc, /*inverse=*/false);

  // Set up the 'to' address.
  mlir::Type elementTy =
      mlir::cast<cir::PointerType>(toArg.getType()).getPointee();
  CIRGenFunction::AutoVarEmission tempDeclEmission(*allocaDecl);
  tempDeclEmission.emittedAsOffload = true;
  tempDeclEmission.setAllocatedAddress(
      Address{toArg, elementTy, cgf.getContext().getDeclAlign(allocaDecl)});

  // Set up the 'from' address from the temporary.
  CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, temporary};
  cgf.setAddrOfLocalVar(
      temporary,
      Address{fromArg, elementTy, cgf.getContext().getDeclAlign(allocaDecl)});

  // Emitting the initializer with the temporary bound performs the copy.
  cgf.emitAutoVarInit(tempDeclEmission);
  builder.setInsertionPointToEnd(&copyRegion.back());
  ls.forceCleanup();
  mlir::acc::YieldOp::create(builder, locEnd);
}
// This function generates the 'combiner' section for a reduction recipe. Note
// that this function is not 'insertion point' clean, in that it alters the
// insertion point to be inside of the 'combiner' section of the recipe, but
// doesn't restore it afterwards.
void OpenACCRecipeBuilderBase::createReductionRecipeCombiner(
    mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp,
    mlir::acc::ReductionRecipeOp recipe, size_t numBounds, QualType origType,
    llvm::ArrayRef<OpenACCReductionRecipe::CombinerRecipe> combinerRecipes) {
  mlir::Block *block =
      createRecipeBlock(recipe.getCombinerRegion(), mainOp.getType(), loc,
                        numBounds, /*isInit=*/false);
  builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back());
  CIRGenFunction::LexicalScope ls(cgf, loc, block);

  mlir::Value lhsArg = block->getArgument(0);
  mlir::Value rhsArg = block->getArgument(1);
  llvm::MutableArrayRef<mlir::BlockArgument> boundsRange =
      block->getArguments().drop_front(2);

  // If any combiner statement is missing, Sema could not build one; emit an
  // NYI and yield the lhs unchanged.
  if (llvm::any_of(combinerRecipes, [](auto &r) { return r.Op == nullptr; })) {
    cgf.cgm.errorNYI(loc, "OpenACC Reduction combiner not generated");
    mlir::acc::YieldOp::create(builder, locEnd, block->getArgument(0));
    return;
  }

  // apply the bounds so that we can get our bounds emitted correctly.
  for (mlir::BlockArgument boundArg : llvm::reverse(boundsRange))
    std::tie(lhsArg, rhsArg) =
        createBoundsLoop(lhsArg, rhsArg, boundArg, loc, /*inverse=*/false);

  // Emitter for when we know this isn't a struct or array we have to loop
  // through. This should work for the 'field' once the get-element call has
  // been made.
  auto emitSingleCombiner =
      [&](mlir::Value lhsArg, mlir::Value rhsArg,
          const OpenACCReductionRecipe::CombinerRecipe &combiner) {
        mlir::Type elementTy =
            mlir::cast<cir::PointerType>(lhsArg.getType()).getPointee();
        // Bind the recipe's LHS/RHS decls to the block arguments so the
        // combiner statement references the right storage.
        CIRGenFunction::DeclMapRevertingRAII declMapRAIILhs{cgf, combiner.LHS};
        cgf.setAddrOfLocalVar(
            combiner.LHS, Address{lhsArg, elementTy,
                                  cgf.getContext().getDeclAlign(combiner.LHS)});
        CIRGenFunction::DeclMapRevertingRAII declMapRAIIRhs{cgf, combiner.RHS};
        cgf.setAddrOfLocalVar(
            combiner.RHS, Address{rhsArg, elementTy,
                                  cgf.getContext().getDeclAlign(combiner.RHS)});
        [[maybe_unused]] mlir::LogicalResult stmtRes =
            cgf.emitStmt(combiner.Op, /*useCurrentScope=*/true);
      };

  // Emitter for when we know this is either a non-array or element of an array
  // (which also shouldn't be an array type?). This function should generate the
  // initialization code for an entire 'array-element'/non-array, including
  // diving into each element of a struct (if necessary).
  auto emitCombiner = [&](mlir::Value lhsArg, mlir::Value rhsArg, QualType ty) {
    assert(!ty->isArrayType() && "Array type shouldn't get here");
    if (const auto *rd = ty->getAsRecordDecl()) {
      if (combinerRecipes.size() == 1 &&
          cgf.getContext().hasSameType(ty, combinerRecipes[0].LHS->getType())) {
        // If this is a 'top level' operator on the type we can just emit this
        // as a simple one.
        emitSingleCombiner(lhsArg, rhsArg, combinerRecipes[0]);
      } else {
        // else we have to handle each individual field after a
        // get-element.
        const CIRGenRecordLayout &layout =
            cgf.cgm.getTypes().getCIRGenRecordLayout(rd);
        // One combiner recipe per field, in declaration order (zip_equal
        // asserts the counts match).
        for (const auto &[field, combiner] :
             llvm::zip_equal(rd->fields(), combinerRecipes)) {
          mlir::Type fieldType = cgf.convertType(field->getType());
          auto fieldPtr = cir::PointerType::get(fieldType);
          unsigned fieldIndex = layout.getCIRFieldNo(field);

          mlir::Value lhsField = builder.createGetMember(
              loc, fieldPtr, lhsArg, field->getName(), fieldIndex);
          mlir::Value rhsField = builder.createGetMember(
              loc, fieldPtr, rhsArg, field->getName(), fieldIndex);

          emitSingleCombiner(lhsField, rhsField, combiner);
        }
      }
    } else {
      // if this is a single-thing (because we should know this isn't an array,
      // as Sema wouldn't let us get here), we can just do a normal emit call.
      emitSingleCombiner(lhsArg, rhsArg, combinerRecipes[0]);
    }
  };

  if (const auto *cat = cgf.getContext().getAsConstantArrayType(origType)) {
    // If we're in an array, we have to emit the combiner for each element of
    // the array.
    auto itrTy = mlir::cast<cir::IntType>(cgf.ptrDiffTy);
    auto itrPtrTy = cir::PointerType::get(itrTy);

    mlir::Value zero =
        builder.getConstInt(loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), 0);
    mlir::Value itr =
        cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "itr",
                              cgf.cgm.getSize(cgf.getPointerAlign()));
    builder.CIRBaseBuilderTy::createStore(loc, zero, itr);

    builder.setInsertionPointAfter(builder.createFor(
        loc,
        /*condBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          auto loadItr = cir::LoadOp::create(builder, loc, {itr});
          mlir::Value arraySize = builder.getConstInt(
              loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), cat->getZExtSize());
          auto cmp = builder.createCompare(loc, cir::CmpOpKind::lt, loadItr,
                                           arraySize);
          builder.createCondition(cmp);
        },
        /*bodyBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          auto loadItr = cir::LoadOp::create(builder, loc, {itr});
          auto lhsElt = builder.getArrayElement(
              loc, loc, lhsArg, cgf.convertType(cat->getElementType()), loadItr,
              /*shouldDecay=*/true);
          auto rhsElt = builder.getArrayElement(
              loc, loc, rhsArg, cgf.convertType(cat->getElementType()), loadItr,
              /*shouldDecay=*/true);

          emitCombiner(lhsElt, rhsElt, cat->getElementType());
          builder.createYield(loc);
        },
        /*stepBuilder=*/
        [&](mlir::OpBuilder &b, mlir::Location loc) {
          auto loadItr = cir::LoadOp::create(builder, loc, {itr});
          auto inc = cir::UnaryOp::create(builder, loc, loadItr.getType(),
                                          cir::UnaryOpKind::Inc, loadItr);
          builder.CIRBaseBuilderTy::createStore(loc, inc, itr);
          builder.createYield(loc);
        }));

  } else if (origType->isArrayType()) {
    cgf.cgm.errorNYI(loc,
                     "OpenACC Reduction combiner non-constant array recipe");
  } else {
    emitCombiner(lhsArg, rhsArg, origType);
  }

  builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back());
  ls.forceCleanup();
  mlir::acc::YieldOp::create(builder, locEnd, block->getArgument(0));
}
} // namespace clang::CIRGen