| //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Top-level implementation for the NVPTX target. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "NVPTXTargetMachine.h" |
| #include "MCTargetDesc/NVPTXMCAsmInfo.h" |
| #include "NVPTX.h" |
| #include "NVPTXAllocaHoisting.h" |
| #include "NVPTXLowerAggrCopies.h" |
| #include "NVPTXTargetObjectFile.h" |
| #include "NVPTXTargetTransformInfo.h" |
| #include "llvm/Analysis/Passes.h" |
| #include "llvm/CodeGen/AsmPrinter.h" |
| #include "llvm/CodeGen/MachineFunctionAnalysis.h" |
| #include "llvm/CodeGen/MachineModuleInfo.h" |
| #include "llvm/CodeGen/Passes.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/IRPrintingPasses.h" |
| #include "llvm/IR/LegacyPassManager.h" |
| #include "llvm/IR/Verifier.h" |
| #include "llvm/MC/MCAsmInfo.h" |
| #include "llvm/MC/MCInstrInfo.h" |
| #include "llvm/MC/MCStreamer.h" |
| #include "llvm/MC/MCSubtargetInfo.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/FormattedStream.h" |
| #include "llvm/Support/TargetRegistry.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetInstrInfo.h" |
| #include "llvm/Target/TargetLowering.h" |
| #include "llvm/Target/TargetLoweringObjectFile.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include "llvm/Target/TargetOptions.h" |
| #include "llvm/Target/TargetRegisterInfo.h" |
| #include "llvm/Target/TargetSubtargetInfo.h" |
| #include "llvm/Transforms/Scalar.h" |
| |
| using namespace llvm; |
| |
| namespace llvm { |
| void initializeNVVMReflectPass(PassRegistry&); |
| void initializeGenericToNVVMPass(PassRegistry&); |
| void initializeNVPTXAllocaHoistingPass(PassRegistry &); |
| void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); |
| void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); |
| void initializeNVPTXLowerKernelArgsPass(PassRegistry &); |
| void initializeNVPTXLowerAllocaPass(PassRegistry &); |
| } |
| |
| extern "C" void LLVMInitializeNVPTXTarget() { |
| // Register the target. |
| RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32); |
| RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64); |
| |
| // FIXME: This pass is really intended to be invoked during IR optimization, |
| // but it's very NVPTX-specific. |
| initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); |
| initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); |
| initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry()); |
| initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry()); |
| initializeNVPTXFavorNonGenericAddrSpacesPass( |
| *PassRegistry::getPassRegistry()); |
| initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry()); |
| initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry()); |
| } |
| |
/// Build the data layout string for the NVPTX target.
///
/// \param is64Bit  when false, an explicit 32-bit pointer specification
///                 ("-p:32:32") is inserted right after the leading "e";
///                 when true no pointer specification is emitted.
/// \returns the complete layout string.
static std::string computeDataLayout(bool is64Bit) {
  const char *PointerSpec = is64Bit ? "" : "-p:32:32";
  return std::string("e") + PointerSpec + "-i64:64-v16:16-v32:32-n16:32:64";
}
| |
| NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, |
| StringRef CPU, StringRef FS, |
| const TargetOptions &Options, |
| Reloc::Model RM, CodeModel::Model CM, |
| CodeGenOpt::Level OL, bool is64bit) |
| : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, RM, |
| CM, OL), |
| is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()), |
| Subtarget(TT, CPU, FS, *this) { |
| if (TT.getOS() == Triple::NVCL) |
| drvInterface = NVPTX::NVCL; |
| else |
| drvInterface = NVPTX::CUDA; |
| initAsmInfo(); |
| } |
| |
| NVPTXTargetMachine::~NVPTXTargetMachine() {} |
| |
| void NVPTXTargetMachine32::anchor() {} |
| |
| NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT, |
| StringRef CPU, StringRef FS, |
| const TargetOptions &Options, |
| Reloc::Model RM, CodeModel::Model CM, |
| CodeGenOpt::Level OL) |
| : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} |
| |
| void NVPTXTargetMachine64::anchor() {} |
| |
| NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, |
| StringRef CPU, StringRef FS, |
| const TargetOptions &Options, |
| Reloc::Model RM, CodeModel::Model CM, |
| CodeGenOpt::Level OL) |
| : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} |
| |
| namespace { |
| class NVPTXPassConfig : public TargetPassConfig { |
| public: |
| NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) |
| : TargetPassConfig(TM, PM) {} |
| |
| NVPTXTargetMachine &getNVPTXTargetMachine() const { |
| return getTM<NVPTXTargetMachine>(); |
| } |
| |
| void addIRPasses() override; |
| bool addInstSelector() override; |
| void addPostRegAlloc() override; |
| void addMachineSSAOptimization() override; |
| |
| FunctionPass *createTargetRegisterAllocator(bool) override; |
| void addFastRegAlloc(FunctionPass *RegAllocPass) override; |
| void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; |
| }; |
| } // end anonymous namespace |
| |
| TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { |
| NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM); |
| return PassConfig; |
| } |
| |
| TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { |
| return TargetIRAnalysis([this](Function &F) { |
| return TargetTransformInfo(NVPTXTTIImpl(this, F)); |
| }); |
| } |
| |
| void NVPTXPassConfig::addIRPasses() { |
| // The following passes are known to not play well with virtual regs hanging |
| // around after register allocation (which in our case, is *all* registers). |
| // We explicitly disable them here. We do, however, need some functionality |
| // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the |
| // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). |
| disablePass(&PrologEpilogCodeInserterID); |
| disablePass(&MachineCopyPropagationID); |
| disablePass(&BranchFolderPassID); |
| disablePass(&TailDuplicateID); |
| |
| addPass(createNVPTXImageOptimizerPass()); |
| TargetPassConfig::addIRPasses(); |
| addPass(createNVPTXAssignValidGlobalNamesPass()); |
| addPass(createGenericToNVVMPass()); |
| addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine())); |
| // NVPTXLowerKernelArgs emits alloca for byval parameters which can often |
| // be eliminated by SROA. |
| addPass(createSROAPass()); |
| addPass(createNVPTXLowerAllocaPass()); |
| addPass(createNVPTXFavorNonGenericAddrSpacesPass()); |
| // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave |
| // them unused. We could remove dead code in an ad-hoc manner, but that |
| // requires manual work and might be error-prone. |
| addPass(createDeadCodeEliminationPass()); |
| addPass(createSeparateConstOffsetFromGEPPass()); |
| // ReassociateGEPs exposes more opportunites for SLSR. See |
| // the example in reassociate-geps-and-slsr.ll. |
| addPass(createStraightLineStrengthReducePass()); |
| // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or |
| // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE |
| // for some of our benchmarks. |
| if (getOptLevel() == CodeGenOpt::Aggressive) |
| addPass(createGVNPass()); |
| else |
| addPass(createEarlyCSEPass()); |
| // Run NaryReassociate after EarlyCSE/GVN to be more effective. |
| addPass(createNaryReassociatePass()); |
| // NaryReassociate on GEPs creates redundant common expressions, so run |
| // EarlyCSE after it. |
| addPass(createEarlyCSEPass()); |
| } |
| |
| bool NVPTXPassConfig::addInstSelector() { |
| const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); |
| |
| addPass(createLowerAggrCopies()); |
| addPass(createAllocaHoisting()); |
| addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); |
| |
| if (!ST.hasImageHandles()) |
| addPass(createNVPTXReplaceImageHandlesPass()); |
| |
| return false; |
| } |
| |
| void NVPTXPassConfig::addPostRegAlloc() { |
| addPass(createNVPTXPrologEpilogPass(), false); |
| // NVPTXPrologEpilogPass calculates frame object offset and replace frame |
| // index with VRFrame register. NVPTXPeephole need to be run after that and |
| // will replace VRFrame with VRFrameLocal when possible. |
| addPass(createNVPTXPeephole()); |
| } |
| |
| FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { |
| return nullptr; // No reg alloc |
| } |
| |
| void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { |
| assert(!RegAllocPass && "NVPTX uses no regalloc!"); |
| addPass(&PHIEliminationID); |
| addPass(&TwoAddressInstructionPassID); |
| } |
| |
| void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { |
| assert(!RegAllocPass && "NVPTX uses no regalloc!"); |
| |
| addPass(&ProcessImplicitDefsID); |
| addPass(&LiveVariablesID); |
| addPass(&MachineLoopInfoID); |
| addPass(&PHIEliminationID); |
| |
| addPass(&TwoAddressInstructionPassID); |
| addPass(&RegisterCoalescerID); |
| |
| // PreRA instruction scheduling. |
| if (addPass(&MachineSchedulerID)) |
| printAndVerify("After Machine Scheduling"); |
| |
| |
| addPass(&StackSlotColoringID); |
| |
| // FIXME: Needs physical registers |
| //addPass(&PostRAMachineLICMID); |
| |
| printAndVerify("After StackSlotColoring"); |
| } |
| |
| void NVPTXPassConfig::addMachineSSAOptimization() { |
| // Pre-ra tail duplication. |
| if (addPass(&EarlyTailDuplicateID)) |
| printAndVerify("After Pre-RegAlloc TailDuplicate"); |
| |
| // Optimize PHIs before DCE: removing dead PHI cycles may make more |
| // instructions dead. |
| addPass(&OptimizePHIsID); |
| |
| // This pass merges large allocas. StackSlotColoring is a different pass |
| // which merges spill slots. |
| addPass(&StackColoringID); |
| |
| // If the target requests it, assign local variables to stack slots relative |
| // to one another and simplify frame index references where possible. |
| addPass(&LocalStackSlotAllocationID); |
| |
| // With optimization, dead code should already be eliminated. However |
| // there is one known exception: lowered code for arguments that are only |
| // used by tail calls, where the tail calls reuse the incoming stack |
| // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). |
| addPass(&DeadMachineInstructionElimID); |
| printAndVerify("After codegen DCE pass"); |
| |
| // Allow targets to insert passes that improve instruction level parallelism, |
| // like if-conversion. Such passes will typically need dominator trees and |
| // loop info, just like LICM and CSE below. |
| if (addILPOpts()) |
| printAndVerify("After ILP optimizations"); |
| |
| addPass(&MachineLICMID); |
| addPass(&MachineCSEID); |
| |
| addPass(&MachineSinkingID); |
| printAndVerify("After Machine LICM, CSE and Sinking passes"); |
| |
| addPass(&PeepholeOptimizerID); |
| printAndVerify("After codegen peephole optimization pass"); |
| } |