mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp - llvm-project - Git at Google

 //===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "GPUOpsLowering.h"

 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/FormatVariadic.h"

 using namespace mlir;

 /// Lowers a `gpu::GPUFuncOp` to an `LLVM::LLVMFuncOp`.
 ///
 /// Workgroup attributions are either encoded as extra `llvm.ptr` function
 /// arguments (when `encodeWorkgroupAttributionsAsArguments` is set) or backed
 /// by `LLVM::GlobalOp` buffers; private attributions are backed by
 /// `LLVM::AllocaOp`s. Each attribution is wrapped in a static-shape memref
 /// descriptor and remapped through the signature conversion before the body
 /// region is moved into the new function. A selected set of argument
 /// attributes is then copied onto the converted arguments and the original
 /// op is erased.
 ///
 /// Returns failure if an attribution element type or the function signature
 /// cannot be converted, or if converting the region types fails.
 LogicalResult
 GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
                                    ConversionPatternRewriter &rewriter) const {
   Location loc = gpuFuncOp.getLoc();

   SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
   if (encodeWorkgroupAttributionsAsArguments) {
     // Append an `llvm.ptr` argument to the function signature to encode
     // workgroup attributions.

     ArrayRef<BlockArgument> workgroupAttributions =
         gpuFuncOp.getWorkgroupAttributions();
     size_t numAttributions = workgroupAttributions.size();

     // Insert all arguments at the end.
     unsigned index = gpuFuncOp.getNumArguments();
     SmallVector<unsigned> argIndices(numAttributions, index);

     // New arguments will simply be `llvm.ptr` with the correct address space
     Type workgroupPtrType =
         rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
     SmallVector<Type> argTypes(numAttributions, workgroupPtrType);

     // Attributes: noalias, llvm.mlir.workgroup_attribution(<size>, <type>)
     // The second entry starts out as a unit attribute placeholder; its value
     // is overwritten for every attribution in the loop below.
     std::array attrs{
         rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
                               rewriter.getUnitAttr()),
         rewriter.getNamedAttr(
             getDialect().getWorkgroupAttributionAttrHelper().getName(),
             rewriter.getUnitAttr()),
     };
     SmallVector<DictionaryAttr> argAttrs;
     for (BlockArgument attribution : workgroupAttributions) {
       auto attributionType = cast<MemRefType>(attribution.getType());
       IntegerAttr numElements =
           rewriter.getI64IntegerAttr(attributionType.getNumElements());
       Type llvmElementType =
           getTypeConverter()->convertType(attributionType.getElementType());
       if (!llvmElementType)
         return failure();
       TypeAttr type = TypeAttr::get(llvmElementType);
       // `attrs` is reused across iterations; only its last entry changes.
       attrs.back().setValue(
           rewriter.getAttr<LLVM::WorkgroupAttributionAttr>(numElements, type));
       argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
     }

     // Location match function location
     SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());

     // Perform signature modification
     rewriter.modifyOpInPlace(
         gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
           static_cast<FunctionOpInterface>(gpuFuncOp).insertArguments(
               argIndices, argTypes, argAttrs, argLocs);
         });
   } else {
     // Back each workgroup attribution with an internal global array named
     // `__wg_<function name>_<attribution index>`.
     workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
     for (auto [idx, attribution] :
          llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
       auto type = dyn_cast<MemRefType>(attribution.getType());
       assert(type && type.hasStaticShape() && "unexpected type in attribution");

       uint64_t numElements = type.getNumElements();

       auto elementType =
           cast<Type>(typeConverter->convertType(type.getElementType()));
       auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
       std::string name =
           std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
       // Use the alignment attribute of the attribution when it is present;
       // otherwise 0 is passed to the global builder.
       uint64_t alignment = 0;
       if (auto alignAttr = dyn_cast_or_null<IntegerAttr>(
               gpuFuncOp.getWorkgroupAttributionAttr(
                   idx, LLVM::LLVMDialect::getAlignAttrName())))
         alignment = alignAttr.getInt();
       auto globalOp = rewriter.create<LLVM::GlobalOp>(
           gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
           LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
           workgroupAddrSpace);
       workgroupBuffers.push_back(globalOp);
     }
   }

   // Remap proper input types.
   TypeConverter::SignatureConversion signatureConversion(
       gpuFuncOp.front().getNumArguments());

   Type funcType = getTypeConverter()->convertFunctionSignature(
       gpuFuncOp.getFunctionType(), /*isVariadic=*/false,
       getTypeConverter()->getOptions().useBarePtrCallConv, signatureConversion);
   if (!funcType) {
     return rewriter.notifyMatchFailure(gpuFuncOp, [&](Diagnostic &diag) {
       diag << "failed to convert function signature type for: "
            << gpuFuncOp.getFunctionType();
     });
   }

   // Create the new function operation. Only copy those attributes that are
   // not specific to function modeling.
   SmallVector<NamedAttribute, 4> attributes;
   ArrayAttr argAttrs;
   for (const auto &attr : gpuFuncOp->getAttrs()) {
     if (attr.getName() == SymbolTable::getSymbolAttrName() ||
         attr.getName() == gpuFuncOp.getFunctionTypeAttrName() ||
         attr.getName() ==
             gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
         attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
         attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
         attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
         attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
       continue;
     // Argument attributes are not forwarded wholesale; they are remembered
     // here and selectively copied onto the converted arguments further down.
     if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
       argAttrs = gpuFuncOp.getArgAttrsAttr();
       continue;
     }
     attributes.push_back(attr);
   }

   DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
   DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
   // Ensure we don't lose information if the function is lowered before its
   // surrounding context.
   auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
   if (knownBlockSize)
     attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
                             knownBlockSize);
   if (knownGridSize)
     attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
                             knownGridSize);

   // Add a dialect specific kernel attribute in addition to GPU kernel
   // attribute. The former is necessary for further translation while the
   // latter is expected by gpu.launch_func.
   if (gpuFuncOp.isKernel()) {
     if (kernelAttributeName)
       attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
     // Set the dialect-specific block size attribute if there is one.
     if (kernelBlockSizeAttributeName && knownBlockSize) {
       attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
     }
   }
   LLVM::CConv callingConvention = gpuFuncOp.isKernel()
                                       ? kernelCallingConvention
                                       : nonKernelCallingConvention;
   auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
       gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
       LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention,
       /*comdat=*/nullptr, attributes);

   {
     // Insert operations that correspond to converted workgroup and private
     // memory attributions to the body of the function. This must operate on
     // the original function, before the body region is inlined in the new
     // function to maintain the relation between block arguments and the
     // parent operation that assigns their semantics.
     OpBuilder::InsertionGuard guard(rewriter);

     // Rewrite workgroup memory attributions to addresses of global buffers.
     rewriter.setInsertionPointToStart(&gpuFuncOp.front());
     unsigned numProperArguments = gpuFuncOp.getNumArguments();

     if (encodeWorkgroupAttributionsAsArguments) {
       // Build a MemRefDescriptor with each of the arguments added above.

       unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
       assert(numProperArguments >= numAttributions &&
              "Expecting attributions to be encoded as arguments already");

       // Arguments encoding workgroup attributions are the last
       // `numAttributions` function arguments, i.e. they are in positions
       // [numProperArguments-numAttributions, numProperArguments)
       ArrayRef<BlockArgument> attributionArguments =
           gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
                                          numAttributions);
       for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
                gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
         auto [attribution, arg] = vals;
         auto type = cast<MemRefType>(attribution.getType());

         // Arguments are of llvm.ptr type and attributions are of memref type:
         // we need to wrap them in memref descriptors.
         Value descr = MemRefDescriptor::fromStaticShape(
             rewriter, loc, *getTypeConverter(), type, arg);

         // And remap the arguments
         signatureConversion.remapInput(numProperArguments + idx, descr);
       }
     } else {
       for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
         auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
                                                   global.getAddrSpace());
         Value address = rewriter.create<LLVM::AddressOfOp>(
             loc, ptrType, global.getSymNameAttr());
         Value memory =
             rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(),
                                          address, ArrayRef<LLVM::GEPArg>{0, 0});

         // Build a memref descriptor pointing to the buffer to plug with the
         // existing memref infrastructure. This may use more registers than
         // otherwise necessary given that memref sizes are fixed, but we can try
         // and canonicalize that away later.
         Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
         auto type = cast<MemRefType>(attribution.getType());
         auto descr = MemRefDescriptor::fromStaticShape(
             rewriter, loc, *getTypeConverter(), type, memory);
         signatureConversion.remapInput(numProperArguments + idx, descr);
       }
     }

     // Rewrite private memory attributions to alloca'ed buffers.
     unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
     auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
     for (const auto [idx, attribution] :
          llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
       auto type = cast<MemRefType>(attribution.getType());
       assert(type && type.hasStaticShape() && "unexpected type in attribution");

       // Explicitly drop memory space when lowering private memory
       // attributions since NVVM models it as `alloca`s in the default
       // memory space and does not support `alloca`s with addrspace(5).
       Type elementType = typeConverter->convertType(type.getElementType());
       auto ptrType =
           LLVM::LLVMPointerType::get(rewriter.getContext(), allocaAddrSpace);
       Value numElements = rewriter.create<LLVM::ConstantOp>(
           gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
       uint64_t alignment = 0;
       if (auto alignAttr =
               dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getPrivateAttributionAttr(
                   idx, LLVM::LLVMDialect::getAlignAttrName())))
         alignment = alignAttr.getInt();
       Value allocated = rewriter.create<LLVM::AllocaOp>(
           gpuFuncOp.getLoc(), ptrType, elementType, numElements, alignment);
       auto descr = MemRefDescriptor::fromStaticShape(
           rewriter, loc, *getTypeConverter(), type, allocated);
       // Private attributions follow the workgroup attributions in the entry
       // block argument list, hence the additional offset.
       signatureConversion.remapInput(
           numProperArguments + numWorkgroupAttributions + idx, descr);
     }
   }

   // Move the region to the new function, update the entry block signature.
   rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                               llvmFuncOp.end());
   if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
                                          &signatureConversion)))
     return failure();

   // Get memref type from function arguments and set the noalias to
   // pointer arguments.
   for (const auto [idx, argTy] :
        llvm::enumerate(gpuFuncOp.getArgumentTypes())) {
     auto remapping = signatureConversion.getInputMapping(idx);
     NamedAttrList argAttr =
         argAttrs ? cast<DictionaryAttr>(argAttrs[idx]) : NamedAttrList();
     // Copies `attrName` (if present) onto every converted argument that the
     // original argument `idx` was expanded into.
     auto copyAttribute = [&](StringRef attrName) {
       Attribute attr = argAttr.erase(attrName);
       if (!attr)
         return;
       for (size_t i = 0, e = remapping->size; i < e; ++i)
         llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
     };
     // Same as `copyAttribute`, but only sets the attribute on converted
     // arguments of `llvm.ptr` type. `noalias` is skipped with a warning when
     // the original argument expanded into more than one converted argument.
     auto copyPointerAttribute = [&](StringRef attrName) {
       Attribute attr = argAttr.erase(attrName);

       if (!attr)
         return;
       if (remapping->size > 1 &&
           attrName == LLVM::LLVMDialect::getNoAliasAttrName()) {
         emitWarning(llvmFuncOp.getLoc(),
                     "Cannot copy noalias with non-bare pointers.\n");
         return;
       }
       for (size_t i = 0, e = remapping->size; i < e; ++i) {
         if (isa<LLVM::LLVMPointerType>(
                 llvmFuncOp.getArgument(remapping->inputNo + i).getType())) {
           llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
         }
       }
     };

     if (argAttr.empty())
       continue;

     copyAttribute(LLVM::LLVMDialect::getReturnedAttrName());
     copyAttribute(LLVM::LLVMDialect::getNoUndefAttrName());
     copyAttribute(LLVM::LLVMDialect::getInRegAttrName());
     // Pointer-only attributes are copied only if at least one of the
     // converted arguments is an `llvm.ptr`.
     bool lowersToPointer = false;
     for (size_t i = 0, e = remapping->size; i < e; ++i) {
       lowersToPointer |= isa<LLVM::LLVMPointerType>(
           llvmFuncOp.getArgument(remapping->inputNo + i).getType());
     }

     if (lowersToPointer) {
       copyPointerAttribute(LLVM::LLVMDialect::getNoAliasAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getNoCaptureAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getNoFreeAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getAlignAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getReadonlyAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getWriteOnlyAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getReadnoneAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getNonNullAttrName());
       copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
       copyPointerAttribute(
           LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
       copyPointerAttribute(
           LLVM::LLVMDialect::WorkgroupAttributionAttrHelper::getNameStr());
     }
   }
   rewriter.eraseOp(gpuFuncOp);
   return success();
 }

 /// Returns the first name of the form `printfFormat_<N>`, trying
 /// N = 0, 1, 2, ... in order, for which `moduleOp.lookupSymbol` finds nothing.
 static SmallString<16> getUniqueFormatGlobalName(gpu::GPUModuleOp moduleOp) {
   const char namePrefix[] = "printfFormat_";
   SmallString<16> candidate;
   for (unsigned suffix = 0;; ++suffix) {
     // Rebuild the candidate from scratch for every suffix.
     candidate.clear();
     (namePrefix + Twine(suffix)).toStringRef(candidate);
     if (!moduleOp.lookupSymbol(candidate))
       return candidate;
   }
 }

 /// Returns a constant global holding `str` followed by a NUL terminator.
 ///
 /// An existing `LLVM::GlobalOp` in `moduleOp` is reused when its type,
 /// constness, value, alignment and address space all match the request.
 /// Otherwise a new internal-linkage global with a unique `printfFormat_<N>`
 /// name is created at the start of the module body; the builder's insertion
 /// point is restored afterwards.
 static LLVM::GlobalOp getOrCreateFormatStringConstant(
     OpBuilder &b, Location loc, gpu::GPUModuleOp moduleOp, Type llvmI8,
     StringRef str, uint64_t alignment = 0, unsigned addrSpace = 0) {
   // The stored string carries an explicit terminator for C consumers.
   llvm::SmallString<20> terminated(str);
   terminated.push_back('\0');
   auto arrayType =
       LLVM::LLVMArrayType::get(llvmI8, terminated.size_in_bytes());
   StringAttr valueAttr = b.getStringAttr(terminated);

   // A candidate is reusable only if every property agrees.
   auto isReusable = [&](LLVM::GlobalOp candidate) {
     return candidate.getGlobalType() == arrayType &&
            candidate.getConstant() &&
            candidate.getValueAttr() == valueAttr &&
            candidate.getAlignment().value_or(0) == alignment &&
            candidate.getAddrSpace() == addrSpace;
   };
   for (auto candidate : moduleOp.getOps<LLVM::GlobalOp>()) {
     if (!isReusable(candidate))
       continue;
     return candidate;
   }

   // Nothing suitable exists yet: materialize a fresh global.
   OpBuilder::InsertionGuard restoreInsertionPoint(b);
   b.setInsertionPointToStart(moduleOp.getBody());
   SmallString<16> freshName = getUniqueFormatGlobalName(moduleOp);
   return b.create<LLVM::GlobalOp>(loc, arrayType,
                                   /*isConstant=*/true, LLVM::Linkage::Internal,
                                   freshName, valueAttr, alignment, addrSpace);
 }

 /// Returns the `LLVM::LLVMFuncOp` named `name` in `moduleOp`. If no such
 /// function exists, one with external linkage and the given `type` is created
 /// at the start of the module body (the rewriter's insertion point is
 /// restored afterwards).
 template <typename T>
 static LLVM::LLVMFuncOp getOrDefineFunction(T &moduleOp, const Location loc,
                                             ConversionPatternRewriter &rewriter,
                                             StringRef name,
                                             LLVM::LLVMFunctionType type) {
   if (auto existing = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))
     return existing;

   ConversionPatternRewriter::InsertionGuard restoreInsertionPoint(rewriter);
   rewriter.setInsertionPointToStart(moduleOp.getBody());
   return rewriter.create<LLVM::LLVMFuncOp>(loc, name, type,
                                            LLVM::Linkage::External);
 }

 /// Lowers `gpu.printf` to a chain of calls to `__ockl_printf_begin`,
 /// `__ockl_printf_append_string_n` and `__ockl_printf_append_args`.
 ///
 /// The format string is stored in a constant global of the enclosing
 /// `gpu.module`. Arguments are passed in groups of seven 64-bit values per
 /// `__ockl_printf_append_args` call: floating-point values are extended to
 /// f64 and bitcast to i64, other values that are not 64 bits wide are
 /// zero-extended, and unused slots are padded with zero.
 ///
 /// Fails to match (without modifying the IR) when the op is not nested in a
 /// `gpu.module`.
 LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
     gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = gpuPrintfOp->getLoc();

   mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
   auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
   mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
   mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
   // Note: this is the GPUModule op, not the ModuleOp that surrounds it
   // This ensures that global constants and declarations are placed within
   // the device code, not the host code
   auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
   // Bail out before creating any IR: there is nowhere to place the runtime
   // declarations and the format string global without a parent gpu.module.
   if (!moduleOp)
     return rewriter.notifyMatchFailure(gpuPrintfOp,
                                        "expected a parent gpu.module");

   auto ocklBegin =
       getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin",
                           LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
   // The append-args entry point is only declared when it will be called.
   LLVM::LLVMFuncOp ocklAppendArgs;
   if (!adaptor.getArgs().empty()) {
     ocklAppendArgs = getOrDefineFunction(
         moduleOp, loc, rewriter, "__ockl_printf_append_args",
         LLVM::LLVMFunctionType::get(
             llvmI64, {llvmI64, /*numArgs*/ llvmI32, llvmI64, llvmI64, llvmI64,
                       llvmI64, llvmI64, llvmI64, llvmI64, /*isLast*/ llvmI32}));
   }
   auto ocklAppendStringN = getOrDefineFunction(
       moduleOp, loc, rewriter, "__ockl_printf_append_string_n",
       LLVM::LLVMFunctionType::get(
           llvmI64,
           {llvmI64, ptrType, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));

   /// Start the printf hostcall
   Value zeroI64 = rewriter.create<LLVM::ConstantOp>(loc, llvmI64, 0);
   auto printfBeginCall = rewriter.create<LLVM::CallOp>(loc, ocklBegin, zeroI64);
   Value printfDesc = printfBeginCall.getResult();

   // Create the global op or find an existing one.
   LLVM::GlobalOp global = getOrCreateFormatStringConstant(
       rewriter, loc, moduleOp, llvmI8, adaptor.getFormat());

   // Get a pointer to the format string's first element and pass it to printf()
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
       loc,
       LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
       global.getSymNameAttr());
   Value stringStart =
       rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getGlobalType(),
                                    globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
   // The length is taken from the stored value, which includes the terminator
   // appended when the global was created.
   Value stringLen = rewriter.create<LLVM::ConstantOp>(
       loc, llvmI64, cast<StringAttr>(global.getValueAttr()).size());

   Value oneI32 = rewriter.create<LLVM::ConstantOp>(loc, llvmI32, 1);
   Value zeroI32 = rewriter.create<LLVM::ConstantOp>(loc, llvmI32, 0);

   // The format string is the last item of the message only if there are no
   // arguments to append.
   auto appendFormatCall = rewriter.create<LLVM::CallOp>(
       loc, ocklAppendStringN,
       ValueRange{printfDesc, stringStart, stringLen,
                  adaptor.getArgs().empty() ? oneI32 : zeroI32});
   printfDesc = appendFormatCall.getResult();

   // __ockl_printf_append_args takes 7 values per append call
   constexpr size_t argsPerAppend = 7;
   size_t nArgs = adaptor.getArgs().size();
   for (size_t group = 0; group < nArgs; group += argsPerAppend) {
     size_t bound = std::min(group + argsPerAppend, nArgs);
     size_t numArgsThisCall = bound - group;

     SmallVector<mlir::Value, 2 + argsPerAppend + 1> arguments;
     arguments.push_back(printfDesc);
     arguments.push_back(
         rewriter.create<LLVM::ConstantOp>(loc, llvmI32, numArgsThisCall));
     for (size_t i = group; i < bound; ++i) {
       Value arg = adaptor.getArgs()[i];
       // Floats travel as the bit pattern of their f64 representation.
       if (auto floatType = dyn_cast<FloatType>(arg.getType())) {
         if (!floatType.isF64())
           arg = rewriter.create<LLVM::FPExtOp>(
               loc, typeConverter->convertType(rewriter.getF64Type()), arg);
         arg = rewriter.create<LLVM::BitcastOp>(loc, llvmI64, arg);
       }
       if (arg.getType().getIntOrFloatBitWidth() != 64)
         arg = rewriter.create<LLVM::ZExtOp>(loc, llvmI64, arg);

       arguments.push_back(arg);
     }
     // Pad out to 7 arguments since the hostcall always needs 7
     for (size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
       arguments.push_back(zeroI64);
     }

     auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
     arguments.push_back(isLast);
     auto call = rewriter.create<LLVM::CallOp>(loc, ocklAppendArgs, arguments);
     printfDesc = call.getResult();
   }
   rewriter.eraseOp(gpuPrintfOp);
   return success();
 }

 /// Lowers `gpu.printf` to a call to a variadic `printf` function declared in
 /// the enclosing `gpu.module`.
 ///
 /// The format string is stored in a constant global in `addressSpace`; a
 /// pointer to its first character is passed as the first call operand,
 /// followed by the converted op arguments unchanged.
 ///
 /// Fails to match (without modifying the IR) when the op is not nested in a
 /// `gpu.module`.
 LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
     gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = gpuPrintfOp->getLoc();

   mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
   mlir::Type ptrType =
       LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace);

   // Note: this is the GPUModule op, not the ModuleOp that surrounds it
   // This ensures that global constants and declarations are placed within
   // the device code, not the host code
   auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
   // Bail out before creating any IR: there is nowhere to place the `printf`
   // declaration and the format string global without a parent gpu.module.
   if (!moduleOp)
     return rewriter.notifyMatchFailure(gpuPrintfOp,
                                        "expected a parent gpu.module");

   auto printfType =
       LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType},
                                   /*isVarArg=*/true);
   LLVM::LLVMFuncOp printfDecl =
       getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);

   // Create the global op or find an existing one.
   LLVM::GlobalOp global = getOrCreateFormatStringConstant(
       rewriter, loc, moduleOp, llvmI8, adaptor.getFormat(), /*alignment=*/0,
       addressSpace);

   // Get a pointer to the format string's first element
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
       loc,
       LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
       global.getSymNameAttr());
   Value stringStart =
       rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getGlobalType(),
                                    globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});

   // Construct arguments and function call
   auto argsRange = adaptor.getArgs();
   SmallVector<Value, 4> printfArgs;
   printfArgs.reserve(argsRange.size() + 1);
   printfArgs.push_back(stringStart);
   printfArgs.append(argsRange.begin(), argsRange.end());

   rewriter.create<LLVM::CallOp>(loc, printfDecl, printfArgs);
   rewriter.eraseOp(gpuPrintfOp);
   return success();
 }

 /// Lowers `gpu.printf` to a call to `vprintf(format, packedArgs)`.
 ///
 /// The format string is stored in a constant global of the enclosing
 /// `gpu.module`. The op arguments are packed into a stack-allocated literal
 /// struct whose address is passed as the second call operand; floating-point
 /// arguments are promoted to f64 first (values that are already f64 are
 /// stored as-is).
 ///
 /// Fails to match (without modifying the IR) when the op is not nested in a
 /// `gpu.module`.
 LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite(
     gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = gpuPrintfOp->getLoc();

   mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
   mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());

   // Note: this is the GPUModule op, not the ModuleOp that surrounds it
   // This ensures that global constants and declarations are placed within
   // the device code, not the host code
   auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
   // Bail out before creating any IR: there is nowhere to place the `vprintf`
   // declaration and the format string global without a parent gpu.module.
   if (!moduleOp)
     return rewriter.notifyMatchFailure(gpuPrintfOp,
                                        "expected a parent gpu.module");

   auto vprintfType =
       LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType, ptrType});
   LLVM::LLVMFuncOp vprintfDecl =
       getOrDefineFunction(moduleOp, loc, rewriter, "vprintf", vprintfType);

   // Create the global op or find an existing one.
   LLVM::GlobalOp global = getOrCreateFormatStringConstant(
       rewriter, loc, moduleOp, llvmI8, adaptor.getFormat());

   // Get a pointer to the format string's first element
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
   Value stringStart =
       rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getGlobalType(),
                                    globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
   SmallVector<Type> types;
   SmallVector<Value> args;
   // Promote and pack the arguments into a stack allocation.
   for (Value arg : adaptor.getArgs()) {
     Type type = arg.getType();
     Value promotedArg = arg;
     assert(type.isIntOrFloat());
     if (isa<FloatType>(type)) {
       type = rewriter.getF64Type();
       // `fpext` requires the source type to be narrower than the result
       // type, so a value that is already f64 must not be extended.
       if (arg.getType() != type)
         promotedArg = rewriter.create<LLVM::FPExtOp>(loc, type, arg);
     }
     types.push_back(type);
     args.push_back(promotedArg);
   }
   Type structType =
       LLVM::LLVMStructType::getLiteral(gpuPrintfOp.getContext(), types);
   Value one = rewriter.create<LLVM::ConstantOp>(loc, rewriter.getI64Type(),
                                                 rewriter.getIndexAttr(1));
   Value tempAlloc =
       rewriter.create<LLVM::AllocaOp>(loc, ptrType, structType, one,
                                       /*alignment=*/0);
   // Store each (promoted) argument into its own field of the struct.
   for (auto [index, arg] : llvm::enumerate(args)) {
     Value ptr = rewriter.create<LLVM::GEPOp>(
         loc, ptrType, structType, tempAlloc,
         ArrayRef<LLVM::GEPArg>{0, static_cast<int32_t>(index)});
     rewriter.create<LLVM::StoreOp>(loc, arg, ptr);
   }
   std::array<Value, 2> printfArgs = {stringStart, tempAlloc};

   rewriter.create<LLVM::CallOp>(loc, vprintfDecl, printfArgs);
   rewriter.eraseOp(gpuPrintfOp);
   return success();
 }

 /// Unrolls `op` element by element when it operates on vectors.
 ///
 /// Requires at least one vector operand, no regions or successors, and a
 /// single vector result; otherwise a match failure is reported. For every
 /// element index, vector operands are reduced to their element at that index
 /// (non-vector operands are forwarded unchanged), an op with the same name
 /// and attributes is created on those scalars, and its result is inserted
 /// into the vector that finally replaces `op`.
 LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands,
                                       ConversionPatternRewriter &rewriter,
                                       const LLVMTypeConverter &converter) {
   TypeRange operandTypes(operands);
   if (!llvm::any_of(operandTypes, llvm::IsaPred<VectorType>))
     return rewriter.notifyMatchFailure(op, "expected vector operand");
   if (op->getNumRegions() != 0 || op->getNumSuccessors() != 0)
     return rewriter.notifyMatchFailure(op, "expected no region/successor");
   if (op->getNumResults() != 1)
     return rewriter.notifyMatchFailure(op, "expected single result");
   auto resultVectorType = dyn_cast<VectorType>(op->getResult(0).getType());
   if (!resultVectorType)
     return rewriter.notifyMatchFailure(op, "expected vector result");

   Location loc = op->getLoc();
   Value accumulated = rewriter.create<LLVM::UndefOp>(loc, resultVectorType);
   Type llvmIndexType = converter.convertType(rewriter.getIndexType());
   StringAttr opName = op->getName().getIdentifier();
   Type scalarType = resultVectorType.getElementType();

   for (int64_t lane = 0, numLanes = resultVectorType.getNumElements();
        lane < numLanes; ++lane) {
     Value laneIndex =
         rewriter.create<LLVM::ConstantOp>(loc, llvmIndexType, lane);

     // Gather the scalar operands for this lane, in operand order.
     SmallVector<Value> laneOperands;
     laneOperands.reserve(operands.size());
     for (Value operand : operands) {
       if (isa<VectorType>(operand.getType()))
         operand =
             rewriter.create<LLVM::ExtractElementOp>(loc, operand, laneIndex);
       laneOperands.push_back(operand);
     }

     Operation *laneOp = rewriter.create(loc, opName, laneOperands, scalarType,
                                         op->getAttrs());
     accumulated = rewriter.create<LLVM::InsertElementOp>(
         loc, accumulated, laneOp->getResult(0), laneIndex);
   }

   rewriter.replaceOp(op, accumulated);
   return success();
 }

 /// Wraps the numeric memory space `space` in a 64-bit integer attribute.
 static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
   IntegerType i64Type = IntegerType::get(ctx, 64);
   return IntegerAttr::get(i64Type, space);
 }

 /// Generates a symbol with 0-sized array type for dynamic shared memory usage,
 /// or uses existing symbol.
 ///
 /// An existing `LLVM::GlobalOp` in `moduleOp` is reused when it is a
 /// zero-element array in the converted address space with the requested
 /// alignment. Otherwise a new internal global with a unique
 /// `__dynamic_shmem_`-prefixed name is created at the start of the module
 /// body.
 ///
 /// Returns a null op, after emitting an error on `op`, when the memory space
 /// of `memrefType` cannot be converted to an integer address space. Callers
 /// must check the result before using it.
 LLVM::GlobalOp getDynamicSharedMemorySymbol(
     ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp,
     gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter,
     MemRefType memrefType, unsigned alignmentBit) {
   uint64_t alignmentByte = alignmentBit / memrefType.getElementTypeBitWidth();

   FailureOr<unsigned> addressSpace =
       typeConverter->getMemRefAddressSpace(memrefType);
   if (failed(addressSpace)) {
     op->emitError() << "conversion of memref memory space "
                     << memrefType.getMemorySpace()
                     << " to integer address space "
                        "failed. Consider adding memory space conversions.";
     // Reading the value of a failed FailureOr is invalid, so stop here.
     return LLVM::GlobalOp();
   }

   // Step 1. Collect symbol names of LLVM::GlobalOp Ops. Also if any of
   // LLVM::GlobalOp is suitable for shared memory, return it.
   llvm::StringSet<> existingGlobalNames;
   for (auto globalOp : moduleOp.getBody()->getOps<LLVM::GlobalOp>()) {
     existingGlobalNames.insert(globalOp.getSymName());
     if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(globalOp.getType())) {
       if (globalOp.getAddrSpace() == *addressSpace &&
           arrayType.getNumElements() == 0 &&
           globalOp.getAlignment().value_or(0) == alignmentByte) {
         return globalOp;
       }
     }
   }

   // Step 2. Find a unique symbol name
   unsigned uniquingCounter = 0;
   SmallString<128> symName = SymbolTable::generateSymbolName<128>(
       "__dynamic_shmem_",
       [&](StringRef candidate) {
         return existingGlobalNames.contains(candidate);
       },
       uniquingCounter);

   // Step 3. Generate a global op
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPointToStart(moduleOp.getBody());

   auto zeroSizedArrayType = LLVM::LLVMArrayType::get(
       typeConverter->convertType(memrefType.getElementType()), 0);

   return rewriter.create<LLVM::GlobalOp>(
       op->getLoc(), zeroSizedArrayType, /*isConstant=*/false,
       LLVM::Linkage::Internal, symName, /*value=*/Attribute(), alignmentByte,
       *addressSpace);
 }

 /// Lowers `gpu.dynamic_shared_memory` to a memref descriptor whose allocated
 /// and aligned pointers both refer to a zero-sized global of the enclosing
 /// `gpu.module`.
 ///
 /// Fails when the element type cannot be converted, when the op is not
 /// nested in a `gpu.module`, or when no global symbol can be obtained for
 /// the memref's memory space.
 LogicalResult GPUDynamicSharedMemoryOpLowering::matchAndRewrite(
     gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
   MemRefType memrefType = op.getResultMemref().getType();
   Type elementType = typeConverter->convertType(memrefType.getElementType());
   if (!elementType)
     return rewriter.notifyMatchFailure(op, "could not convert element type");

   // Step 1: Generate a memref<0xi8> type
   MemRefLayoutAttrInterface layout = {};
   auto memrefType0sz =
       MemRefType::get({0}, elementType, layout, memrefType.getMemorySpace());

   // Step 2: Generate a global symbol or existing for the dynamic shared
   // memory with memref<0xi8> type
   auto moduleOp = op->getParentOfType<gpu::GPUModuleOp>();
   if (!moduleOp)
     return rewriter.notifyMatchFailure(op, "expected a parent gpu.module");
   LLVM::GlobalOp shmemOp = getDynamicSharedMemorySymbol(
       rewriter, moduleOp, op, getTypeConverter(), memrefType0sz, alignmentBit);
   // The helper has already emitted a diagnostic in this case.
   if (!shmemOp)
     return failure();

   // Step 3. Get address of the global symbol
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPoint(op);
   auto basePtr = rewriter.create<LLVM::AddressOfOp>(loc, shmemOp);
   Type baseType = basePtr->getResultTypes().front();

   // Step 4. Generate GEP using offsets
   SmallVector<LLVM::GEPArg> gepArgs = {0};
   Value shmemPtr = rewriter.create<LLVM::GEPOp>(loc, baseType, elementType,
                                                 basePtr, gepArgs);
   // Step 5. Create a memref descriptor
   SmallVector<Value> shape, strides;
   Value sizeBytes;
   getMemRefDescriptorSizes(loc, memrefType0sz, {}, rewriter, shape, strides,
                            sizeBytes);
   auto memRefDescriptor = this->createMemRefDescriptor(
       loc, memrefType0sz, shmemPtr, shmemPtr, shape, strides, rewriter);

   // Step 6. Replace the op with memref descriptor
   rewriter.replaceOp(op, {memRefDescriptor});
   return success();
 }

 /// Lowers `gpu.return` to `llvm.return`.
 ///
 /// With the bare-pointer calling convention, memref operands that can be
 /// converted to a bare pointer are replaced by the allocated pointer of
 /// their descriptor, and unranked memref operands make the pattern fail.
 /// Otherwise the converted operands are used after `copyUnrankedDescriptors`
 /// has processed them. Zero or one operand is returned directly; several
 /// operands are packed into a single LLVM struct value first.
 LogicalResult GPUReturnOpLowering::matchAndRewrite(
     gpu::ReturnOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
   const bool useBarePtrCallConv =
       getTypeConverter()->getOptions().useBarePtrCallConv;

   SmallVector<Value, 4> updatedOperands;
   if (!useBarePtrCallConv) {
     updatedOperands.assign(adaptor.getOperands().begin(),
                            adaptor.getOperands().end());
     (void)copyUnrankedDescriptors(rewriter, loc, op.getOperands().getTypes(),
                                   updatedOperands,
                                   /*toDynamic=*/true);
   } else {
     // For the bare-ptr calling convention, extract the aligned pointer to
     // be returned from the memref descriptor.
     for (auto [original, converted] :
          llvm::zip(op->getOperands(), adaptor.getOperands())) {
       Type originalType = original.getType();
       Value replacement = converted;
       auto rankedType = dyn_cast<MemRefType>(originalType);
       if (rankedType && getTypeConverter()->canConvertToBarePtr(
                             cast<BaseMemRefType>(originalType))) {
         MemRefDescriptor descriptor(replacement);
         replacement = descriptor.allocatedPtr(rewriter, loc);
       } else if (isa<UnrankedMemRefType>(originalType)) {
         // Unranked memref is not supported in the bare pointer calling
         // convention.
         return failure();
       }
       updatedOperands.push_back(replacement);
     }
   }

   // Zero or one operand: no packing is needed, emit the return directly.
   if (op.getNumOperands() <= 1) {
     rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(
         op, TypeRange(), updatedOperands, op->getAttrs());
     return success();
   }

   // Several operands: pack them into an LLVM struct value and return that.
   auto structType = getTypeConverter()->packFunctionResults(
       op.getOperandTypes(), useBarePtrCallConv);
   if (!structType)
     return rewriter.notifyMatchFailure(op, "could not convert result types");

   Value aggregate = rewriter.create<LLVM::UndefOp>(loc, structType);
   for (auto [position, element] : llvm::enumerate(updatedOperands))
     aggregate = rewriter.create<LLVM::InsertValueOp>(loc, aggregate, element,
                                                      position);
   rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, TypeRange(), aggregate,
                                               op->getAttrs());
   return success();
 }

 /// Registers a type-attribute conversion on `typeConverter` that replaces a
 /// `gpu::AddressSpaceAttr` on a memref type with the 64-bit integer attribute
 /// holding the numeric address space that `mapping` returns for it.
 void mlir::populateGpuMemorySpaceAttributeConversions(
     TypeConverter &typeConverter, const MemorySpaceMapping &mapping) {
   typeConverter.addTypeAttributeConversion(
       [mapping](BaseMemRefType type, gpu::AddressSpaceAttr memorySpaceAttr) {
         // `mapping` is captured by copy so the registered callback does not
         // depend on the lifetime of the caller's argument.
         const unsigned numericSpace = mapping(memorySpaceAttr.getValue());
         return wrapNumericMemorySpace(memorySpaceAttr.getContext(),
                                       numericSpace);
       });
 }