mlir/lib/Target/LLVM/ROCDL/Target.cpp - llvm-project - Git at Google

 //===- Target.cpp - MLIR LLVM ROCDL target compilation ----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This files defines ROCDL target related functions including registration
 // calls for the `#rocdl.target` compilation attribute.
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Target/LLVM/ROCDL/Target.h"

 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVM/ROCDL/Utils.h"
 #include "mlir/Target/LLVMIR/Export.h"

 #include "llvm/IR/Constants.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/TargetParser/TargetParser.h"

 #include <cstdlib>
 #include <optional>

 using namespace mlir;
 using namespace mlir::ROCDL;

 #ifndef __DEFAULT_ROCM_PATH__
 #define __DEFAULT_ROCM_PATH__ ""
 #endif

 namespace {
 // Implementation of the `TargetAttrInterface` model.
 class ROCDLTargetAttrImpl
     : public gpu::TargetAttrInterface::FallbackModel<ROCDLTargetAttrImpl> {
 public:
   std::optional<SmallVector<char, 0>>
   serializeToObject(Attribute attribute, Operation *module,
                     const gpu::TargetOptions &options) const;

   Attribute createObject(Attribute attribute, Operation *module,
                          const SmallVector<char, 0> &object,
                          const gpu::TargetOptions &options) const;
 };
 } // namespace

 // Register the ROCDL dialect, the ROCDL translation and the target interface.
 void mlir::ROCDL::registerROCDLTargetInterfaceExternalModels(
     DialectRegistry &registry) {
   registry.addExtension(+[](MLIRContext *ctx, ROCDL::ROCDLDialect *dialect) {
     ROCDLTargetAttr::attachInterface<ROCDLTargetAttrImpl>(*ctx);
   });
 }

 void mlir::ROCDL::registerROCDLTargetInterfaceExternalModels(
     MLIRContext &context) {
   DialectRegistry registry;
   registerROCDLTargetInterfaceExternalModels(registry);
   context.appendDialectRegistry(registry);
 }

 // Search for the ROCM path.
 StringRef mlir::ROCDL::getROCMPath() {
   if (const char *var = std::getenv("ROCM_PATH"))
     return var;
   if (const char *var = std::getenv("ROCM_ROOT"))
     return var;
   if (const char *var = std::getenv("ROCM_HOME"))
     return var;
   return __DEFAULT_ROCM_PATH__;
 }

 SerializeGPUModuleBase::SerializeGPUModuleBase(
     Operation &module, ROCDLTargetAttr target,
     const gpu::TargetOptions &targetOptions)
     : ModuleToObject(module, target.getTriple(), target.getChip(),
                      target.getFeatures(), target.getO()),
       target(target), toolkitPath(targetOptions.getToolkitPath()),
       librariesToLink(targetOptions.getLibrariesToLink()) {

   // If `targetOptions` has an empty toolkitPath use `getROCMPath`
   if (toolkitPath.empty())
     toolkitPath = getROCMPath();

   // Append the files in the target attribute.
   if (target.getLink())
     librariesToLink.append(target.getLink().begin(), target.getLink().end());
 }

 void SerializeGPUModuleBase::init() {
   static llvm::once_flag initializeBackendOnce;
   llvm::call_once(initializeBackendOnce, []() {
   // If the `AMDGPU` LLVM target was built, initialize it.
 #if MLIR_ENABLE_ROCM_CONVERSIONS
     LLVMInitializeAMDGPUTarget();
     LLVMInitializeAMDGPUTargetInfo();
     LLVMInitializeAMDGPUTargetMC();
     LLVMInitializeAMDGPUAsmParser();
     LLVMInitializeAMDGPUAsmPrinter();
 #endif
   });
 }

 ROCDLTargetAttr SerializeGPUModuleBase::getTarget() const { return target; }

 StringRef SerializeGPUModuleBase::getToolkitPath() const { return toolkitPath; }

 ArrayRef<Attribute> SerializeGPUModuleBase::getLibrariesToLink() const {
   return librariesToLink;
 }

 LogicalResult SerializeGPUModuleBase::appendStandardLibs(AMDGCNLibraries libs) {
   if (libs == AMDGCNLibraries::None)
     return success();
   StringRef pathRef = getToolkitPath();

   // Get the path for the device libraries
   SmallString<256> path;
   path.insert(path.begin(), pathRef.begin(), pathRef.end());
   llvm::sys::path::append(path, "amdgcn", "bitcode");
   pathRef = StringRef(path.data(), path.size());

   // Fail if the path is invalid.
   if (!llvm::sys::fs::is_directory(pathRef)) {
     getOperation().emitError() << "ROCm amdgcn bitcode path: " << pathRef
                                << " does not exist or is not a directory";
     return failure();
   }

   // Helper function for adding a library.
   auto addLib = [&](const Twine &lib) -> bool {
     auto baseSize = path.size();
     llvm::sys::path::append(path, lib);
     StringRef pathRef(path.data(), path.size());
     if (!llvm::sys::fs::is_regular_file(pathRef)) {
       getOperation().emitRemark() << "bitcode library path: " << pathRef
                                   << " does not exist or is not a file";
       return true;
     }
     librariesToLink.push_back(StringAttr::get(target.getContext(), pathRef));
     path.truncate(baseSize);
     return false;
   };

   // Add ROCm device libraries. Fail if any of the libraries is not found, ie.
   // if any of the `addLib` failed.
   if ((any(libs & AMDGCNLibraries::Ocml) && addLib("ocml.bc")) ||
       (any(libs & AMDGCNLibraries::Ockl) && addLib("ockl.bc")) ||
       (any(libs & AMDGCNLibraries::Hip) && addLib("hip.bc")) ||
       (any(libs & AMDGCNLibraries::OpenCL) && addLib("opencl.bc")))
     return failure();
   return success();
 }

 std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
 SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
   // Return if there are no libs to load.
   if (deviceLibs == AMDGCNLibraries::None && librariesToLink.empty())
     return SmallVector<std::unique_ptr<llvm::Module>>();
   if (failed(appendStandardLibs(deviceLibs)))
     return std::nullopt;
   SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
   if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
                                       bcFiles, true)))
     return std::nullopt;
   return std::move(bcFiles);
 }

 LogicalResult SerializeGPUModuleBase::handleBitcodeFile(llvm::Module &module) {
   // Some ROCM builds don't strip this like they should
   if (auto *openclVersion = module.getNamedMetadata("opencl.ocl.version"))
     module.eraseNamedMetadata(openclVersion);
   // Stop spamming us with clang version numbers
   if (auto *ident = module.getNamedMetadata("llvm.ident"))
     module.eraseNamedMetadata(ident);
   // Override the libModules datalayout and target triple with the compiler's
   // data layout should there be a discrepency.
   setDataLayoutAndTriple(module);
   return success();
 }

 void SerializeGPUModuleBase::handleModulePreLink(llvm::Module &module) {
   // If all libraries are not set, traverse the module to determine which
   // libraries are required.
   if (deviceLibs != AMDGCNLibraries::All) {
     for (llvm::Function &f : module.functions()) {
       if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
         StringRef funcName = f.getName();
         if ("printf" == funcName)
           deviceLibs |= AMDGCNLibraries::OpenCL | AMDGCNLibraries::Ockl |
                         AMDGCNLibraries::Ocml;
         if (funcName.starts_with("__ockl_"))
           deviceLibs |= AMDGCNLibraries::Ockl;
         if (funcName.starts_with("__ocml_"))
           deviceLibs |= AMDGCNLibraries::Ocml;
         if (funcName == "__atomic_work_item_fence")
           deviceLibs |= AMDGCNLibraries::Hip;
       }
     }
   }
   addControlVariables(module, deviceLibs, target.hasWave64(), target.hasDaz(),
                       target.hasFiniteOnly(), target.hasUnsafeMath(),
                       target.hasFastMath(), target.hasCorrectSqrt(),
                       target.getAbi());
 }

 void SerializeGPUModuleBase::addControlVariables(
     llvm::Module &module, AMDGCNLibraries libs, bool wave64, bool daz,
     bool finiteOnly, bool unsafeMath, bool fastMath, bool correctSqrt,
     StringRef abiVer) {
   // Helper function for adding control variables.
   auto addControlVariable = [&module](StringRef name, uint32_t value,
                                       uint32_t bitwidth) {
     if (module.getNamedGlobal(name))
       return;
     llvm::IntegerType *type =
         llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
     llvm::GlobalVariable *controlVariable = new llvm::GlobalVariable(
         module, /*isConstant=*/type, true,
         llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
         llvm::ConstantInt::get(type, value), name, /*before=*/nullptr,
         /*threadLocalMode=*/llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
         /*addressSpace=*/4);
     controlVariable->setVisibility(
         llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
     controlVariable->setAlignment(llvm::MaybeAlign(bitwidth / 8));
     controlVariable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
   };

   // Note that COV6 requires ROCm 6.3+.
   int abi = 600;
   abiVer.getAsInteger(0, abi);
   module.addModuleFlag(llvm::Module::Error, "amdhsa_code_object_version", abi);
   // Return if no device libraries are required.
   if (libs == AMDGCNLibraries::None)
     return;
   // Add ocml related control variables.
   if (any(libs & AMDGCNLibraries::Ocml)) {
     addControlVariable("__oclc_finite_only_opt", finiteOnly || fastMath, 8);
     addControlVariable("__oclc_daz_opt", daz || fastMath, 8);
     addControlVariable("__oclc_correctly_rounded_sqrt32",
                        correctSqrt && !fastMath, 8);
     addControlVariable("__oclc_unsafe_math_opt", unsafeMath || fastMath, 8);
   }
   // Add ocml or ockl related control variables.
   if (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl))) {
     addControlVariable("__oclc_wavefrontsize64", wave64, 8);
     // Get the ISA version.
     llvm::AMDGPU::IsaVersion isaVersion = llvm::AMDGPU::getIsaVersion(chip);
     // Add the ISA control variable.
     addControlVariable("__oclc_ISA_version",
                        isaVersion.Minor + 100 * isaVersion.Stepping +
                            1000 * isaVersion.Major,
                        32);
     addControlVariable("__oclc_ABI_version", abi, 32);
   }
 }

 std::optional<SmallVector<char, 0>>
 SerializeGPUModuleBase::assembleIsa(StringRef isa) {
   auto loc = getOperation().getLoc();

   StringRef targetTriple = this->triple;

   SmallVector<char, 0> result;
   llvm::raw_svector_ostream os(result);

   llvm::Triple triple(llvm::Triple::normalize(targetTriple));
   std::string error;
   const llvm::Target *target =
       llvm::TargetRegistry::lookupTarget(triple.normalize(), error);
   if (!target) {
     emitError(loc, Twine("failed to lookup target: ") + error);
     return std::nullopt;
   }

   llvm::SourceMgr srcMgr;
   srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc());

   const llvm::MCTargetOptions mcOptions;
   std::unique_ptr<llvm::MCRegisterInfo> mri(
       target->createMCRegInfo(targetTriple));
   std::unique_ptr<llvm::MCAsmInfo> mai(
       target->createMCAsmInfo(*mri, targetTriple, mcOptions));
   std::unique_ptr<llvm::MCSubtargetInfo> sti(
       target->createMCSubtargetInfo(targetTriple, chip, features));

   llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
                       &mcOptions);
   std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
       ctx, /*PIC=*/false, /*LargeCodeModel=*/false));
   ctx.setObjectFileInfo(mofi.get());

   SmallString<128> cwd;
   if (!llvm::sys::fs::current_path(cwd))
     ctx.setCompilationDir(cwd);

   std::unique_ptr<llvm::MCStreamer> mcStreamer;
   std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());

   llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx);
   llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions);
   mcStreamer.reset(target->createMCObjectStreamer(
       triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab),
       mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce),
       *sti));

   std::unique_ptr<llvm::MCAsmParser> parser(
       createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
   std::unique_ptr<llvm::MCTargetAsmParser> tap(
       target->createMCAsmParser(*sti, *parser, *mcii, mcOptions));

   if (!tap) {
     emitError(loc, "assembler initialization error");
     return std::nullopt;
   }

   parser->setTargetParser(*tap);
   parser->Run(false);
   return std::move(result);
 }

 std::optional<SmallVector<char, 0>>
 SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
   // Assemble the ISA.
   std::optional<SmallVector<char, 0>> isaBinary = assembleIsa(serializedISA);

   if (!isaBinary) {
     getOperation().emitError() << "failed during ISA assembling";
     return std::nullopt;
   }

   // Save the ISA binary to a temp file.
   int tempIsaBinaryFd = -1;
   SmallString<128> tempIsaBinaryFilename;
   if (llvm::sys::fs::createTemporaryFile("kernel%%", "o", tempIsaBinaryFd,
                                          tempIsaBinaryFilename)) {
     getOperation().emitError()
         << "failed to create a temporary file for dumping the ISA binary";
     return std::nullopt;
   }
   llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
   {
     llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
     tempIsaBinaryOs << StringRef(isaBinary->data(), isaBinary->size());
     tempIsaBinaryOs.flush();
   }

   // Create a temp file for HSA code object.
   SmallString<128> tempHsacoFilename;
   if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco",
                                          tempHsacoFilename)) {
     getOperation().emitError()
         << "failed to create a temporary file for the HSA code object";
     return std::nullopt;
   }
   llvm::FileRemover cleanupHsaco(tempHsacoFilename);

   llvm::SmallString<128> lldPath(toolkitPath);
   llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld");
   int lldResult = llvm::sys::ExecuteAndWait(
       lldPath,
       {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
   if (lldResult != 0) {
     getOperation().emitError() << "lld invocation failed";
     return std::nullopt;
   }

   // Load the HSA code object.
   auto hsacoFile =
       llvm::MemoryBuffer::getFile(tempHsacoFilename, /*IsText=*/false);
   if (!hsacoFile) {
     getOperation().emitError()
         << "failed to read the HSA code object from the temp file";
     return std::nullopt;
   }

   StringRef buffer = (*hsacoFile)->getBuffer();

   return SmallVector<char, 0>(buffer.begin(), buffer.end());
 }

 std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
     const gpu::TargetOptions &targetOptions, llvm::Module &llvmModule) {
   // Return LLVM IR if the compilation target is offload.
 #define DEBUG_TYPE "serialize-to-llvm"
   LLVM_DEBUG({
     llvm::dbgs() << "LLVM IR for module: "
                  << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
                  << llvmModule << "\n";
   });
 #undef DEBUG_TYPE
   if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
     return SerializeGPUModuleBase::moduleToObject(llvmModule);

   std::optional<llvm::TargetMachine *> targetMachine =
       getOrCreateTargetMachine();
   if (!targetMachine) {
     getOperation().emitError() << "target Machine unavailable for triple "
                                << triple << ", can't compile with LLVM";
     return std::nullopt;
   }

   // Translate the Module to ISA.
   std::optional<std::string> serializedISA =
       translateToISA(llvmModule, **targetMachine);
   if (!serializedISA) {
     getOperation().emitError() << "failed translating the module to ISA";
     return std::nullopt;
   }
 #define DEBUG_TYPE "serialize-to-isa"
   LLVM_DEBUG({
     llvm::dbgs() << "ISA for module: "
                  << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
                  << *serializedISA << "\n";
   });
 #undef DEBUG_TYPE
   // Return ISA assembly code if the compilation target is assembly.
   if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
     return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());

   // Compiling to binary requires a valid ROCm path, fail if it's not found.
   if (getToolkitPath().empty()) {
     getOperation().emitError() << "invalid ROCm path, please set a valid path";
     return std::nullopt;
   }

   // Compile to binary.
   return compileToBinary(*serializedISA);
 }

 #if MLIR_ENABLE_ROCM_CONVERSIONS
 namespace {
 class AMDGPUSerializer : public SerializeGPUModuleBase {
 public:
   AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
                    const gpu::TargetOptions &targetOptions);

   std::optional<SmallVector<char, 0>>
   moduleToObject(llvm::Module &llvmModule) override;

 private:
   // Target options.
   gpu::TargetOptions targetOptions;
 };
 } // namespace

 AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
                                    const gpu::TargetOptions &targetOptions)
     : SerializeGPUModuleBase(module, target, targetOptions),
       targetOptions(targetOptions) {}

 std::optional<SmallVector<char, 0>>
 AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   return moduleToObjectImpl(targetOptions, llvmModule);
 }
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS

 std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
     Attribute attribute, Operation *module,
     const gpu::TargetOptions &options) const {
   assert(module && "The module must be non null.");
   if (!module)
     return std::nullopt;
   if (!mlir::isa<gpu::GPUModuleOp>(module)) {
     module->emitError("module must be a GPU module");
     return std::nullopt;
   }
 #if MLIR_ENABLE_ROCM_CONVERSIONS
   AMDGPUSerializer serializer(*module, cast<ROCDLTargetAttr>(attribute),
                               options);
   serializer.init();
   return serializer.run();
 #else
   module->emitError("the `AMDGPU` target was not built. Please enable it when "
                     "building LLVM");
   return std::nullopt;
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS
 }

 Attribute
 ROCDLTargetAttrImpl::createObject(Attribute attribute, Operation *module,
                                   const SmallVector<char, 0> &object,
                                   const gpu::TargetOptions &options) const {
   gpu::CompilationTarget format = options.getCompilationTarget();
   // If format is `fatbin` transform it to binary as `fatbin` is not yet
   // supported.
   gpu::KernelTableAttr kernels;
   if (format > gpu::CompilationTarget::Binary) {
     format = gpu::CompilationTarget::Binary;
     kernels = ROCDL::getKernelMetadata(module, object);
   }
   DictionaryAttr properties{};
   Builder builder(attribute.getContext());
   StringAttr objectStr =
       builder.getStringAttr(StringRef(object.data(), object.size()));
   return builder.getAttr<gpu::ObjectAttr>(attribute, format, objectStr,
                                           properties, kernels);
 }
	//===- Target.cpp - MLIR LLVM ROCDL target compilation ----------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This files defines ROCDL target related functions including registration
	// calls for the `#rocdl.target` compilation attribute.
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Target/LLVM/ROCDL/Target.h"

	#include "mlir/Dialect/GPU/IR/GPUDialect.h"
	#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
	#include "mlir/Support/FileUtilities.h"
	#include "mlir/Target/LLVM/ROCDL/Utils.h"
	#include "mlir/Target/LLVMIR/Export.h"

	#include "llvm/IR/Constants.h"
	#include "llvm/MC/MCAsmBackend.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCCodeEmitter.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/MC/MCObjectFileInfo.h"
	#include "llvm/MC/MCObjectWriter.h"
	#include "llvm/MC/MCParser/MCTargetAsmParser.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/TargetRegistry.h"
	#include "llvm/Support/FileSystem.h"
	#include "llvm/Support/FileUtilities.h"
	#include "llvm/Support/Path.h"
	#include "llvm/Support/Program.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/TargetSelect.h"
	#include "llvm/TargetParser/TargetParser.h"

	#include <cstdlib>
	#include <optional>

	using namespace mlir;
	using namespace mlir::ROCDL;

	#ifndef __DEFAULT_ROCM_PATH__
	#define __DEFAULT_ROCM_PATH__ ""
	#endif

	namespace {
	// Implementation of the `TargetAttrInterface` model.
	class ROCDLTargetAttrImpl
	: public gpu::TargetAttrInterface::FallbackModel<ROCDLTargetAttrImpl> {
	public:
	std::optional<SmallVector<char, 0>>
	serializeToObject(Attribute attribute, Operation *module,
	const gpu::TargetOptions &options) const;

	Attribute createObject(Attribute attribute, Operation *module,
	const SmallVector<char, 0> &object,
	const gpu::TargetOptions &options) const;
	};
	} // namespace

	// Register the ROCDL dialect, the ROCDL translation and the target interface.
	void mlir::ROCDL::registerROCDLTargetInterfaceExternalModels(
	DialectRegistry &registry) {
	registry.addExtension(+[](MLIRContext ctx, ROCDL::ROCDLDialect dialect) {
	ROCDLTargetAttr::attachInterface<ROCDLTargetAttrImpl>(*ctx);
	});
	}

	void mlir::ROCDL::registerROCDLTargetInterfaceExternalModels(
	MLIRContext &context) {
	DialectRegistry registry;
	registerROCDLTargetInterfaceExternalModels(registry);
	context.appendDialectRegistry(registry);
	}

	// Search for the ROCM path.
	StringRef mlir::ROCDL::getROCMPath() {
	if (const char *var = std::getenv("ROCM_PATH"))
	return var;
	if (const char *var = std::getenv("ROCM_ROOT"))
	return var;
	if (const char *var = std::getenv("ROCM_HOME"))
	return var;
	return __DEFAULT_ROCM_PATH__;
	}

	SerializeGPUModuleBase::SerializeGPUModuleBase(
	Operation &module, ROCDLTargetAttr target,
	const gpu::TargetOptions &targetOptions)
	: ModuleToObject(module, target.getTriple(), target.getChip(),
	target.getFeatures(), target.getO()),
	target(target), toolkitPath(targetOptions.getToolkitPath()),
	librariesToLink(targetOptions.getLibrariesToLink()) {

	// If `targetOptions` has an empty toolkitPath use `getROCMPath`
	if (toolkitPath.empty())
	toolkitPath = getROCMPath();

	// Append the files in the target attribute.
	if (target.getLink())
	librariesToLink.append(target.getLink().begin(), target.getLink().end());
	}

	void SerializeGPUModuleBase::init() {
	static llvm::once_flag initializeBackendOnce;
	llvm::call_once(initializeBackendOnce, []() {
	// If the `AMDGPU` LLVM target was built, initialize it.
	#if MLIR_ENABLE_ROCM_CONVERSIONS
	LLVMInitializeAMDGPUTarget();
	LLVMInitializeAMDGPUTargetInfo();
	LLVMInitializeAMDGPUTargetMC();
	LLVMInitializeAMDGPUAsmParser();
	LLVMInitializeAMDGPUAsmPrinter();
	#endif
	});
	}

	ROCDLTargetAttr SerializeGPUModuleBase::getTarget() const { return target; }

	StringRef SerializeGPUModuleBase::getToolkitPath() const { return toolkitPath; }

	ArrayRef<Attribute> SerializeGPUModuleBase::getLibrariesToLink() const {
	return librariesToLink;
	}

	LogicalResult SerializeGPUModuleBase::appendStandardLibs(AMDGCNLibraries libs) {
	if (libs == AMDGCNLibraries::None)
	return success();
	StringRef pathRef = getToolkitPath();

	// Get the path for the device libraries
	SmallString<256> path;
	path.insert(path.begin(), pathRef.begin(), pathRef.end());
	llvm::sys::path::append(path, "amdgcn", "bitcode");
	pathRef = StringRef(path.data(), path.size());

	// Fail if the path is invalid.
	if (!llvm::sys::fs::is_directory(pathRef)) {
	getOperation().emitError() << "ROCm amdgcn bitcode path: " << pathRef
	<< " does not exist or is not a directory";
	return failure();
	}

	// Helper function for adding a library.
	auto addLib = [&](const Twine &lib) -> bool {
	auto baseSize = path.size();
	llvm::sys::path::append(path, lib);
	StringRef pathRef(path.data(), path.size());
	if (!llvm::sys::fs::is_regular_file(pathRef)) {
	getOperation().emitRemark() << "bitcode library path: " << pathRef
	<< " does not exist or is not a file";
	return true;
	}
	librariesToLink.push_back(StringAttr::get(target.getContext(), pathRef));
	path.truncate(baseSize);
	return false;
	};

	// Add ROCm device libraries. Fail if any of the libraries is not found, ie.
	// if any of the `addLib` failed.
	if ((any(libs & AMDGCNLibraries::Ocml) && addLib("ocml.bc")) \|\|
	(any(libs & AMDGCNLibraries::Ockl) && addLib("ockl.bc")) \|\|
	(any(libs & AMDGCNLibraries::Hip) && addLib("hip.bc")) \|\|
	(any(libs & AMDGCNLibraries::OpenCL) && addLib("opencl.bc")))
	return failure();
	return success();
	}

	std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
	SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
	// Return if there are no libs to load.
	if (deviceLibs == AMDGCNLibraries::None && librariesToLink.empty())
	return SmallVector<std::unique_ptr<llvm::Module>>();
	if (failed(appendStandardLibs(deviceLibs)))
	return std::nullopt;
	SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
	if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
	bcFiles, true)))
	return std::nullopt;
	return std::move(bcFiles);
	}

	LogicalResult SerializeGPUModuleBase::handleBitcodeFile(llvm::Module &module) {
	// Some ROCM builds don't strip this like they should
	if (auto *openclVersion = module.getNamedMetadata("opencl.ocl.version"))
	module.eraseNamedMetadata(openclVersion);
	// Stop spamming us with clang version numbers
	if (auto *ident = module.getNamedMetadata("llvm.ident"))
	module.eraseNamedMetadata(ident);
	// Override the libModules datalayout and target triple with the compiler's
	// data layout should there be a discrepency.
	setDataLayoutAndTriple(module);
	return success();
	}

	void SerializeGPUModuleBase::handleModulePreLink(llvm::Module &module) {
	// If all libraries are not set, traverse the module to determine which
	// libraries are required.
	if (deviceLibs != AMDGCNLibraries::All) {
	for (llvm::Function &f : module.functions()) {
	if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
	StringRef funcName = f.getName();
	if ("printf" == funcName)
	deviceLibs \|= AMDGCNLibraries::OpenCL \| AMDGCNLibraries::Ockl \|
	AMDGCNLibraries::Ocml;
	if (funcName.starts_with("__ockl_"))
	deviceLibs \|= AMDGCNLibraries::Ockl;
	if (funcName.starts_with("__ocml_"))
	deviceLibs \|= AMDGCNLibraries::Ocml;
	if (funcName == "__atomic_work_item_fence")
	deviceLibs \|= AMDGCNLibraries::Hip;
	}
	}
	}
	addControlVariables(module, deviceLibs, target.hasWave64(), target.hasDaz(),
	target.hasFiniteOnly(), target.hasUnsafeMath(),
	target.hasFastMath(), target.hasCorrectSqrt(),
	target.getAbi());
	}

	void SerializeGPUModuleBase::addControlVariables(
	llvm::Module &module, AMDGCNLibraries libs, bool wave64, bool daz,
	bool finiteOnly, bool unsafeMath, bool fastMath, bool correctSqrt,
	StringRef abiVer) {
	// Helper function for adding control variables.
	auto addControlVariable = [&module](StringRef name, uint32_t value,
	uint32_t bitwidth) {
	if (module.getNamedGlobal(name))
	return;
	llvm::IntegerType *type =
	llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
	llvm::GlobalVariable *controlVariable = new llvm::GlobalVariable(
	module, /isConstant=/type, true,
	llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
	llvm::ConstantInt::get(type, value), name, /before=/nullptr,
	/threadLocalMode=/llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
	/addressSpace=/4);
	controlVariable->setVisibility(
	llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
	controlVariable->setAlignment(llvm::MaybeAlign(bitwidth / 8));
	controlVariable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
	};

	// Note that COV6 requires ROCm 6.3+.
	int abi = 600;
	abiVer.getAsInteger(0, abi);
	module.addModuleFlag(llvm::Module::Error, "amdhsa_code_object_version", abi);
	// Return if no device libraries are required.
	if (libs == AMDGCNLibraries::None)
	return;
	// Add ocml related control variables.
	if (any(libs & AMDGCNLibraries::Ocml)) {
	addControlVariable("__oclc_finite_only_opt", finiteOnly \|\| fastMath, 8);
	addControlVariable("__oclc_daz_opt", daz \|\| fastMath, 8);
	addControlVariable("__oclc_correctly_rounded_sqrt32",
	correctSqrt && !fastMath, 8);
	addControlVariable("__oclc_unsafe_math_opt", unsafeMath \|\| fastMath, 8);
	}
	// Add ocml or ockl related control variables.
	if (any(libs & (AMDGCNLibraries::Ocml \| AMDGCNLibraries::Ockl))) {
	addControlVariable("__oclc_wavefrontsize64", wave64, 8);
	// Get the ISA version.
	llvm::AMDGPU::IsaVersion isaVersion = llvm::AMDGPU::getIsaVersion(chip);
	// Add the ISA control variable.
	addControlVariable("__oclc_ISA_version",
	isaVersion.Minor + 100 * isaVersion.Stepping +
	1000 * isaVersion.Major,
	32);
	addControlVariable("__oclc_ABI_version", abi, 32);
	}
	}

	std::optional<SmallVector<char, 0>>
	SerializeGPUModuleBase::assembleIsa(StringRef isa) {
	auto loc = getOperation().getLoc();

	StringRef targetTriple = this->triple;

	SmallVector<char, 0> result;
	llvm::raw_svector_ostream os(result);

	llvm::Triple triple(llvm::Triple::normalize(targetTriple));
	std::string error;
	const llvm::Target *target =
	llvm::TargetRegistry::lookupTarget(triple.normalize(), error);
	if (!target) {
	emitError(loc, Twine("failed to lookup target: ") + error);
	return std::nullopt;
	}

	llvm::SourceMgr srcMgr;
	srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc());

	const llvm::MCTargetOptions mcOptions;
	std::unique_ptr<llvm::MCRegisterInfo> mri(
	target->createMCRegInfo(targetTriple));
	std::unique_ptr<llvm::MCAsmInfo> mai(
	target->createMCAsmInfo(*mri, targetTriple, mcOptions));
	std::unique_ptr<llvm::MCSubtargetInfo> sti(
	target->createMCSubtargetInfo(targetTriple, chip, features));

	llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
	&mcOptions);
	std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
	ctx, /PIC=/false, /LargeCodeModel=/false));
	ctx.setObjectFileInfo(mofi.get());

	SmallString<128> cwd;
	if (!llvm::sys::fs::current_path(cwd))
	ctx.setCompilationDir(cwd);

	std::unique_ptr<llvm::MCStreamer> mcStreamer;
	std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());

	llvm::MCCodeEmitter ce = target->createMCCodeEmitter(mcii, ctx);
	llvm::MCAsmBackend mab = target->createMCAsmBackend(sti, *mri, mcOptions);
	mcStreamer.reset(target->createMCObjectStreamer(
	triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab),
	mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce),
	*sti));

	std::unique_ptr<llvm::MCAsmParser> parser(
	createMCAsmParser(srcMgr, ctx, mcStreamer, mai));
	std::unique_ptr<llvm::MCTargetAsmParser> tap(
	target->createMCAsmParser(sti, parser, *mcii, mcOptions));

	if (!tap) {
	emitError(loc, "assembler initialization error");
	return std::nullopt;
	}

	parser->setTargetParser(*tap);
	parser->Run(false);
	return std::move(result);
	}

	std::optional<SmallVector<char, 0>>
	SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
	// Assemble the ISA.
	std::optional<SmallVector<char, 0>> isaBinary = assembleIsa(serializedISA);

	if (!isaBinary) {
	getOperation().emitError() << "failed during ISA assembling";
	return std::nullopt;
	}

	// Save the ISA binary to a temp file.
	int tempIsaBinaryFd = -1;
	SmallString<128> tempIsaBinaryFilename;
	if (llvm::sys::fs::createTemporaryFile("kernel%%", "o", tempIsaBinaryFd,
	tempIsaBinaryFilename)) {
	getOperation().emitError()
	<< "failed to create a temporary file for dumping the ISA binary";
	return std::nullopt;
	}
	llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
	{
	llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
	tempIsaBinaryOs << StringRef(isaBinary->data(), isaBinary->size());
	tempIsaBinaryOs.flush();
	}

	// Create a temp file for HSA code object.
	SmallString<128> tempHsacoFilename;
	if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco",
	tempHsacoFilename)) {
	getOperation().emitError()
	<< "failed to create a temporary file for the HSA code object";
	return std::nullopt;
	}
	llvm::FileRemover cleanupHsaco(tempHsacoFilename);

	llvm::SmallString<128> lldPath(toolkitPath);
	llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld");
	int lldResult = llvm::sys::ExecuteAndWait(
	lldPath,
	{"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
	if (lldResult != 0) {
	getOperation().emitError() << "lld invocation failed";
	return std::nullopt;
	}

	// Load the HSA code object.
	auto hsacoFile =
	llvm::MemoryBuffer::getFile(tempHsacoFilename, /IsText=/false);
	if (!hsacoFile) {
	getOperation().emitError()
	<< "failed to read the HSA code object from the temp file";
	return std::nullopt;
	}

	StringRef buffer = (*hsacoFile)->getBuffer();

	return SmallVector<char, 0>(buffer.begin(), buffer.end());
	}

	std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
	const gpu::TargetOptions &targetOptions, llvm::Module &llvmModule) {
	// Return LLVM IR if the compilation target is offload.
	#define DEBUG_TYPE "serialize-to-llvm"
	LLVM_DEBUG({
	llvm::dbgs() << "LLVM IR for module: "
	<< cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
	<< llvmModule << "\n";
	});
	#undef DEBUG_TYPE
	if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
	return SerializeGPUModuleBase::moduleToObject(llvmModule);

	std::optional<llvm::TargetMachine *> targetMachine =
	getOrCreateTargetMachine();
	if (!targetMachine) {
	getOperation().emitError() << "target Machine unavailable for triple "
	<< triple << ", can't compile with LLVM";
	return std::nullopt;
	}

	// Translate the Module to ISA.
	std::optional<std::string> serializedISA =
	translateToISA(llvmModule, **targetMachine);
	if (!serializedISA) {
	getOperation().emitError() << "failed translating the module to ISA";
	return std::nullopt;
	}
	#define DEBUG_TYPE "serialize-to-isa"
	LLVM_DEBUG({
	llvm::dbgs() << "ISA for module: "
	<< cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
	<< *serializedISA << "\n";
	});
	#undef DEBUG_TYPE
	// Return ISA assembly code if the compilation target is assembly.
	if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
	return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());

	// Compiling to binary requires a valid ROCm path, fail if it's not found.
	if (getToolkitPath().empty()) {
	getOperation().emitError() << "invalid ROCm path, please set a valid path";
	return std::nullopt;
	}

	// Compile to binary.
	return compileToBinary(*serializedISA);
	}

	#if MLIR_ENABLE_ROCM_CONVERSIONS
	namespace {
	class AMDGPUSerializer : public SerializeGPUModuleBase {
	public:
	AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
	const gpu::TargetOptions &targetOptions);

	std::optional<SmallVector<char, 0>>
	moduleToObject(llvm::Module &llvmModule) override;

	private:
	// Target options.
	gpu::TargetOptions targetOptions;
	};
	} // namespace

	AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
	const gpu::TargetOptions &targetOptions)
	: SerializeGPUModuleBase(module, target, targetOptions),
	targetOptions(targetOptions) {}

	std::optional<SmallVector<char, 0>>
	AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
	return moduleToObjectImpl(targetOptions, llvmModule);
	}
	#endif // MLIR_ENABLE_ROCM_CONVERSIONS

	std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
	Attribute attribute, Operation *module,
	const gpu::TargetOptions &options) const {
	assert(module && "The module must be non null.");
	if (!module)
	return std::nullopt;
	if (!mlir::isa<gpu::GPUModuleOp>(module)) {
	module->emitError("module must be a GPU module");
	return std::nullopt;
	}
	#if MLIR_ENABLE_ROCM_CONVERSIONS
	AMDGPUSerializer serializer(*module, cast<ROCDLTargetAttr>(attribute),
	options);
	serializer.init();
	return serializer.run();
	#else
	module->emitError("the `AMDGPU` target was not built. Please enable it when "
	"building LLVM");
	return std::nullopt;
	#endif // MLIR_ENABLE_ROCM_CONVERSIONS
	}

	Attribute
	ROCDLTargetAttrImpl::createObject(Attribute attribute, Operation *module,
	const SmallVector<char, 0> &object,
	const gpu::TargetOptions &options) const {
	gpu::CompilationTarget format = options.getCompilationTarget();
	// If format is `fatbin` transform it to binary as `fatbin` is not yet
	// supported.
	gpu::KernelTableAttr kernels;
	if (format > gpu::CompilationTarget::Binary) {
	format = gpu::CompilationTarget::Binary;
	kernels = ROCDL::getKernelMetadata(module, object);
	}
	DictionaryAttr properties{};
	Builder builder(attribute.getContext());
	StringAttr objectStr =
	builder.getStringAttr(StringRef(object.data(), object.size()));
	return builder.getAttr<gpu::ObjectAttr>(attribute, format, objectStr,
	properties, kernels);
	}