| //===-- PTXGenerator.cpp - IR helper to create GPU kernels ----------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains functions to generate GPU-parallel code as LLVM-IR. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "polly/CodeGen/PTXGenerator.h" |
| |
| #ifdef GPU_CODEGEN |
| #include "polly/ScopDetection.h" |
| #include "polly/ScopInfo.h" |
| |
| #include "llvm/PassManager.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/FormattedStream.h" |
| #include "llvm/Support/TargetRegistry.h" |
| #include "llvm/Support/TargetSelect.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/Target/TargetLibraryInfo.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| #include "llvm/Transforms/Utils/Cloning.h" |
| |
| using namespace llvm; |
| using namespace polly; |
| |
| PTXGenerator::PTXGenerator(PollyIRBuilder &Builder, Pass *P, |
| const std::string &Triple) |
| : Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1), |
| BlockWidth(1), BlockHeight(1), OutputBytes(0) { |
| InitializeGPUDataTypes(); |
| } |
| |
| Module *PTXGenerator::getModule() { |
| return Builder.GetInsertBlock()->getParent()->getParent(); |
| } |
| |
| Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) { |
| assert(NumArgs == 1 && "we support only one array access now."); |
| |
| Module *M = getModule(); |
| Function *F = Builder.GetInsertBlock()->getParent(); |
| std::vector<Type *> Arguments; |
| for (int i = 0; i < NumArgs; i++) |
| Arguments.push_back(Builder.getInt8PtrTy()); |
| FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); |
| Function *FN = Function::Create(FT, Function::InternalLinkage, |
| F->getName() + "_ptx_subfn", M); |
| FN->setCallingConv(CallingConv::PTX_Kernel); |
| |
| // Do not run any optimization pass on the new function. |
| P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(FN); |
| |
| for (Function::arg_iterator AI = FN->arg_begin(); AI != FN->arg_end(); ++AI) |
| AI->setName("ptx.Array"); |
| |
| return FN; |
| } |
| |
| void PTXGenerator::createSubfunction(SetVector<Value *> &UsedValues, |
| SetVector<Value *> &OriginalIVS, |
| PTXGenerator::ValueToValueMapTy &VMap, |
| Function **SubFunction) { |
| Function *FN = createSubfunctionDefinition(UsedValues.size()); |
| Module *M = getModule(); |
| LLVMContext &Context = FN->getContext(); |
| IntegerType *Ty = Builder.getInt64Ty(); |
| |
| // Store the previous basic block. |
| BasicBlock *PrevBB = Builder.GetInsertBlock(); |
| |
| // Create basic blocks. |
| BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN); |
| BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN); |
| BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN); |
| |
| DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| DT.addNewBlock(HeaderBB, PrevBB); |
| DT.addNewBlock(ExitBB, HeaderBB); |
| DT.addNewBlock(BodyBB, HeaderBB); |
| |
| Builder.SetInsertPoint(HeaderBB); |
| |
| // Insert VMap items with maps of array base address on the host to base |
| // address on the device. |
| Function::arg_iterator AI = FN->arg_begin(); |
| for (unsigned j = 0; j < UsedValues.size(); j++) { |
| Value *BaseAddr = UsedValues[j]; |
| Type *ArrayTy = BaseAddr->getType(); |
| Value *Param = Builder.CreateBitCast(AI, ArrayTy); |
| VMap.insert(std::make_pair(BaseAddr, Param)); |
| AI++; |
| } |
| |
| // FIXME: These intrinsics should be inserted on-demand. However, we insert |
| // them all currently for simplicity. |
| Function *GetNctaidX = |
| Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x); |
| Function *GetNctaidY = |
| Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y); |
| Function *GetCtaidX = |
| Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x); |
| Function *GetCtaidY = |
| Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y); |
| Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x); |
| Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y); |
| Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x); |
| Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y); |
| |
| Value *GridWidth = Builder.CreateCall(GetNctaidX); |
| GridWidth = Builder.CreateIntCast(GridWidth, Ty, false); |
| Value *GridHeight = Builder.CreateCall(GetNctaidY); |
| GridHeight = Builder.CreateIntCast(GridHeight, Ty, false); |
| Value *BlockWidth = Builder.CreateCall(GetNtidX); |
| BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false); |
| Value *BlockHeight = Builder.CreateCall(GetNtidY); |
| BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false); |
| Value *BIDx = Builder.CreateCall(GetCtaidX); |
| BIDx = Builder.CreateIntCast(BIDx, Ty, false); |
| Value *BIDy = Builder.CreateCall(GetCtaidY); |
| BIDy = Builder.CreateIntCast(BIDy, Ty, false); |
| Value *TIDx = Builder.CreateCall(GetTidX); |
| TIDx = Builder.CreateIntCast(TIDx, Ty, false); |
| Value *TIDy = Builder.CreateCall(GetTidY); |
| TIDy = Builder.CreateIntCast(TIDy, Ty, false); |
| |
| Builder.CreateBr(BodyBB); |
| Builder.SetInsertPoint(BodyBB); |
| |
| unsigned NumDims = OriginalIVS.size(); |
| std::vector<Value *> Substitutions; |
| Value *BlockID, *ThreadID; |
| switch (NumDims) { |
| case 1: { |
| Value *BlockSize = |
| Builder.CreateMul(BlockWidth, BlockHeight, "p_gpu_blocksize"); |
| BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); |
| BlockID = Builder.CreateAdd(BlockID, BIDx); |
| BlockID = Builder.CreateMul(BlockID, BlockSize); |
| ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); |
| ThreadID = Builder.CreateAdd(ThreadID, TIDx); |
| ThreadID = Builder.CreateAdd(ThreadID, BlockID); |
| Substitutions.push_back(ThreadID); |
| break; |
| } |
| case 2: { |
| BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); |
| BlockID = Builder.CreateAdd(BlockID, BIDx); |
| Substitutions.push_back(BlockID); |
| ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); |
| ThreadID = Builder.CreateAdd(ThreadID, TIDx); |
| Substitutions.push_back(ThreadID); |
| break; |
| } |
| case 3: { |
| BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); |
| BlockID = Builder.CreateAdd(BlockID, BIDx); |
| Substitutions.push_back(BlockID); |
| Substitutions.push_back(TIDy); |
| Substitutions.push_back(TIDx); |
| break; |
| } |
| case 4: { |
| Substitutions.push_back(BIDy); |
| Substitutions.push_back(BIDx); |
| Substitutions.push_back(TIDy); |
| Substitutions.push_back(TIDx); |
| break; |
| } |
| default: |
| assert(true && |
| "We cannot transform parallel loops whose depth is larger than 4."); |
| return; |
| } |
| |
| assert(OriginalIVS.size() == Substitutions.size() && |
| "The size of IVS should be equal to the size of substitutions."); |
| for (unsigned i = 0; i < OriginalIVS.size(); ++i) { |
| VMap.insert(std::make_pair(OriginalIVS[i], Substitutions[i])); |
| } |
| |
| Builder.CreateBr(ExitBB); |
| Builder.SetInsertPoint(--Builder.GetInsertPoint()); |
| BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); |
| |
| // Add the termination of the ptx-device subfunction. |
| Builder.SetInsertPoint(ExitBB); |
| Builder.CreateRetVoid(); |
| |
| Builder.SetInsertPoint(LoopBody); |
| *SubFunction = FN; |
| } |
| |
| void PTXGenerator::startGeneration(SetVector<Value *> &UsedValues, |
| SetVector<Value *> &OriginalIVS, |
| ValueToValueMapTy &VMap, |
| BasicBlock::iterator *LoopBody) { |
| Function *SubFunction; |
| BasicBlock::iterator PrevInsertPoint = Builder.GetInsertPoint(); |
| createSubfunction(UsedValues, OriginalIVS, VMap, &SubFunction); |
| *LoopBody = Builder.GetInsertPoint(); |
| Builder.SetInsertPoint(PrevInsertPoint); |
| } |
| |
| IntegerType *PTXGenerator::getInt64Type() { return Builder.getInt64Ty(); } |
| |
| PointerType *PTXGenerator::getI8PtrType() { |
| return PointerType::getUnqual(Builder.getInt8Ty()); |
| } |
| |
| PointerType *PTXGenerator::getPtrI8PtrType() { |
| return PointerType::getUnqual(getI8PtrType()); |
| } |
| |
| PointerType *PTXGenerator::getFloatPtrType() { |
| return llvm::Type::getFloatPtrTy(getModule()->getContext()); |
| } |
| |
| PointerType *PTXGenerator::getGPUContextPtrType() { |
| return PointerType::getUnqual(ContextTy); |
| } |
| |
| PointerType *PTXGenerator::getGPUModulePtrType() { |
| return PointerType::getUnqual(ModuleTy); |
| } |
| |
| PointerType *PTXGenerator::getGPUDevicePtrType() { |
| return PointerType::getUnqual(DeviceTy); |
| } |
| |
| PointerType *PTXGenerator::getPtrGPUDevicePtrType() { |
| return PointerType::getUnqual(DevDataTy); |
| } |
| |
| PointerType *PTXGenerator::getGPUFunctionPtrType() { |
| return PointerType::getUnqual(KernelTy); |
| } |
| |
| PointerType *PTXGenerator::getGPUEventPtrType() { |
| return PointerType::getUnqual(EventTy); |
| } |
| |
| void PTXGenerator::InitializeGPUDataTypes() { |
| LLVMContext &Context = getModule()->getContext(); |
| |
| ContextTy = StructType::create(Context, "struct.PollyGPUContextT"); |
| ModuleTy = StructType::create(Context, "struct.PollyGPUModuleT"); |
| KernelTy = StructType::create(Context, "struct.PollyGPUFunctionT"); |
| DeviceTy = StructType::create(Context, "struct.PollyGPUDeviceT"); |
| DevDataTy = StructType::create(Context, "struct.PollyGPUDevicePtrT"); |
| EventTy = StructType::create(Context, "struct.PollyGPUEventT"); |
| } |
| |
| void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) { |
| const char *Name = "polly_initDevice"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(PointerType::getUnqual(getGPUContextPtrType())); |
| Args.push_back(PointerType::getUnqual(getGPUDevicePtrType())); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall2(F, Context, Device); |
| } |
| |
| void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) { |
| const char *Name = "polly_getPTXModule"; |
| llvm::Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getI8PtrType()); |
| Args.push_back(PointerType::getUnqual(getGPUModulePtrType())); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall2(F, Buffer, Module); |
| } |
| |
| void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module, |
| Value *Kernel) { |
| const char *Name = "polly_getPTXKernelEntry"; |
| llvm::Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getI8PtrType()); |
| Args.push_back(getGPUModulePtrType()); |
| Args.push_back(PointerType::getUnqual(getGPUFunctionPtrType())); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall3(F, Entry, Module, Kernel); |
| } |
| |
| void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData, |
| Value *DeviceData, |
| Value *Size) { |
| const char *Name = "polly_allocateMemoryForHostAndDevice"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getPtrI8PtrType()); |
| Args.push_back(PointerType::getUnqual(getPtrGPUDevicePtrType())); |
| Args.push_back(getInt64Type()); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall3(F, HostData, DeviceData, Size); |
| } |
| |
| void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData, |
| Value *HostData, |
| Value *Size) { |
| const char *Name = "polly_copyFromHostToDevice"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getPtrGPUDevicePtrType()); |
| Args.push_back(getI8PtrType()); |
| Args.push_back(getInt64Type()); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall3(F, DeviceData, HostData, Size); |
| } |
| |
| void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData, |
| Value *DeviceData, |
| Value *Size) { |
| const char *Name = "polly_copyFromDeviceToHost"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getI8PtrType()); |
| Args.push_back(getPtrGPUDevicePtrType()); |
| Args.push_back(getInt64Type()); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall3(F, HostData, DeviceData, Size); |
| } |
| |
| void PTXGenerator::createCallSetKernelParameters(Value *Kernel, |
| Value *BlockWidth, |
| Value *BlockHeight, |
| Value *DeviceData) { |
| const char *Name = "polly_setKernelParameters"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getGPUFunctionPtrType()); |
| Args.push_back(getInt64Type()); |
| Args.push_back(getInt64Type()); |
| Args.push_back(getPtrGPUDevicePtrType()); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall4(F, Kernel, BlockWidth, BlockHeight, DeviceData); |
| } |
| |
| void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth, |
| Value *GridHeight) { |
| const char *Name = "polly_launchKernel"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getGPUFunctionPtrType()); |
| Args.push_back(getInt64Type()); |
| Args.push_back(getInt64Type()); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall3(F, Kernel, GridWidth, GridHeight); |
| } |
| |
| void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent, |
| Value *StopEvent) { |
| const char *Name = "polly_startTimerByCudaEvent"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(PointerType::getUnqual(getGPUEventPtrType())); |
| Args.push_back(PointerType::getUnqual(getGPUEventPtrType())); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall2(F, StartEvent, StopEvent); |
| } |
| |
| void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent, |
| Value *StopEvent, |
| Value *Timer) { |
| const char *Name = "polly_stopTimerByCudaEvent"; |
| Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getGPUEventPtrType()); |
| Args.push_back(getGPUEventPtrType()); |
| Args.push_back(getFloatPtrType()); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall3(F, StartEvent, StopEvent, Timer); |
| } |
| |
| void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData, |
| Value *DeviceData, |
| Value *Module, |
| Value *Context, |
| Value *Kernel) { |
| const char *Name = "polly_cleanupGPGPUResources"; |
| llvm::Module *M = getModule(); |
| Function *F = M->getFunction(Name); |
| |
| // If F is not available, declare it. |
| if (!F) { |
| GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; |
| std::vector<Type *> Args; |
| Args.push_back(getI8PtrType()); |
| Args.push_back(getPtrGPUDevicePtrType()); |
| Args.push_back(getGPUModulePtrType()); |
| Args.push_back(getGPUContextPtrType()); |
| Args.push_back(getGPUFunctionPtrType()); |
| FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); |
| F = Function::Create(Ty, Linkage, Name, M); |
| } |
| |
| Builder.CreateCall5(F, HostData, DeviceData, Module, Context, Kernel); |
| } |
| |
| Value *PTXGenerator::getCUDAGridWidth() { |
| return ConstantInt::get(getInt64Type(), GridWidth); |
| } |
| |
| Value *PTXGenerator::getCUDAGridHeight() { |
| return ConstantInt::get(getInt64Type(), GridHeight); |
| } |
| |
| Value *PTXGenerator::getCUDABlockWidth() { |
| return ConstantInt::get(getInt64Type(), BlockWidth); |
| } |
| |
| Value *PTXGenerator::getCUDABlockHeight() { |
| return ConstantInt::get(getInt64Type(), BlockHeight); |
| } |
| |
| Value *PTXGenerator::getOutputArraySizeInBytes() { |
| return ConstantInt::get(getInt64Type(), OutputBytes); |
| } |
| |
| static Module *extractPTXFunctionsFromModule(const Module *M, |
| const StringRef &Triple) { |
| llvm::ValueToValueMapTy VMap; |
| Module *New = new Module("TempGPUModule", M->getContext()); |
| New->setTargetTriple(Triple::normalize(Triple)); |
| |
| // Loop over the functions in the module, making external functions as before |
| for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { |
| if (!I->isDeclaration() && |
| (I->getCallingConv() == CallingConv::PTX_Device || |
| I->getCallingConv() == CallingConv::PTX_Kernel)) { |
| Function *NF = |
| Function::Create(cast<FunctionType>(I->getType()->getElementType()), |
| I->getLinkage(), I->getName(), New); |
| NF->copyAttributesFrom(I); |
| VMap[I] = NF; |
| |
| Function::arg_iterator DestI = NF->arg_begin(); |
| for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end(); |
| ++J) { |
| DestI->setName(J->getName()); |
| VMap[J] = DestI++; |
| } |
| SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned. |
| CloneFunctionInto(NF, I, VMap, /*ModuleLevelChanges=*/true, Returns); |
| } |
| } |
| |
| return New; |
| } |
| |
| static bool createASMAsString(Module *New, const StringRef &Triple, |
| const StringRef &MCPU, const StringRef &Features, |
| std::string &ASM) { |
| llvm::Triple TheTriple(Triple::normalize(Triple)); |
| std::string ErrMsg; |
| const Target *TheTarget = |
| TargetRegistry::lookupTarget(TheTriple.getTriple(), ErrMsg); |
| if (!TheTarget) { |
| errs() << ErrMsg << "\n"; |
| return false; |
| } |
| |
| TargetOptions Options; |
| std::unique_ptr<TargetMachine> target(TheTarget->createTargetMachine( |
| TheTriple.getTriple(), MCPU, Features, Options)); |
| assert(target.get() && "Could not allocate target machine!"); |
| TargetMachine &Target = *target.get(); |
| |
| // Build up all of the passes that we want to do to the module. |
| PassManager PM; |
| |
| TargetLibraryInfo *TLI = new TargetLibraryInfo(TheTriple); |
| PM.add(TLI); |
| |
| PM.add(new DataLayoutPass(*Target.getDataLayout())); |
| Target.addAnalysisPasses(PM); |
| |
| { |
| raw_string_ostream NameROS(ASM); |
| formatted_raw_ostream FOS(NameROS); |
| |
| // Ask the target to add backend passes as necessary. |
| int UseVerifier = true; |
| if (Target.addPassesToEmitFile(PM, FOS, TargetMachine::CGFT_AssemblyFile, |
| UseVerifier)) { |
| errs() << "The target does not support generation of this file type!\n"; |
| return false; |
| } |
| |
| PM.run(*New); |
| FOS.flush(); |
| } |
| |
| return true; |
| } |
| |
| Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) { |
| Module *M = getModule(); |
| Module *GPUModule = extractPTXFunctionsFromModule(M, GPUTriple); |
| std::string LLVMKernelStr; |
| if (!createASMAsString(GPUModule, GPUTriple, "sm_20" /*MCPU*/, |
| "" /*Features*/, LLVMKernelStr)) { |
| errs() << "Generate ptx string failed!\n"; |
| return NULL; |
| } |
| |
| Value *LLVMKernel = |
| Builder.CreateGlobalStringPtr(LLVMKernelStr, "llvm_kernel"); |
| |
| delete GPUModule; |
| return LLVMKernel; |
| } |
| |
| Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) { |
| StringRef Entry = SubFunction->getName(); |
| return Builder.CreateGlobalStringPtr(Entry, "ptx_entry"); |
| } |
| |
| void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) { |
| Module *M = getModule(); |
| SubFunction->eraseFromParent(); |
| |
| if (Function *FuncPTXReadNCtaidX = M->getFunction("llvm.ptx.read.nctaid.x")) { |
| FuncPTXReadNCtaidX->eraseFromParent(); |
| } |
| |
| if (Function *FuncPTXReadNCtaidY = M->getFunction("llvm.ptx.read.nctaid.y")) { |
| FuncPTXReadNCtaidY->eraseFromParent(); |
| } |
| |
| if (Function *FuncPTXReadCtaidX = M->getFunction("llvm.ptx.read.ctaid.x")) { |
| FuncPTXReadCtaidX->eraseFromParent(); |
| } |
| |
| if (Function *FuncPTXReadCtaidY = M->getFunction("llvm.ptx.read.ctaid.y")) { |
| FuncPTXReadCtaidY->eraseFromParent(); |
| } |
| |
| if (Function *FuncPTXReadNTidX = M->getFunction("llvm.ptx.read.ntid.x")) { |
| FuncPTXReadNTidX->eraseFromParent(); |
| } |
| |
| if (Function *FuncPTXReadNTidY = M->getFunction("llvm.ptx.read.ntid.y")) { |
| FuncPTXReadNTidY->eraseFromParent(); |
| } |
| |
| if (Function *FuncPTXReadTidX = M->getFunction("llvm.ptx.read.tid.x")) { |
| FuncPTXReadTidX->eraseFromParent(); |
| } |
| |
| if (Function *FuncPTXReadTidY = M->getFunction("llvm.ptx.read.tid.y")) { |
| FuncPTXReadTidY->eraseFromParent(); |
| } |
| } |
| |
| void PTXGenerator::finishGeneration(Function *F) { |
| // Define data used by the GPURuntime library. |
| AllocaInst *PtrCUContext = |
| Builder.CreateAlloca(getGPUContextPtrType(), 0, "phcontext"); |
| AllocaInst *PtrCUDevice = |
| Builder.CreateAlloca(getGPUDevicePtrType(), 0, "phdevice"); |
| AllocaInst *PtrCUModule = |
| Builder.CreateAlloca(getGPUModulePtrType(), 0, "phmodule"); |
| AllocaInst *PtrCUKernel = |
| Builder.CreateAlloca(getGPUFunctionPtrType(), 0, "phkernel"); |
| AllocaInst *PtrCUStartEvent = |
| Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstart_timer"); |
| AllocaInst *PtrCUStopEvent = |
| Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstop_timer"); |
| AllocaInst *PtrDevData = |
| Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0, "pdevice_data"); |
| AllocaInst *PtrHostData = |
| Builder.CreateAlloca(getI8PtrType(), 0, "phost_data"); |
| Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext()); |
| AllocaInst *PtrElapsedTimes = Builder.CreateAlloca(FloatTy, 0, "ptimer"); |
| |
| // Initialize the GPU device. |
| createCallInitDevice(PtrCUContext, PtrCUDevice); |
| |
| // Create the GPU kernel module and entry function. |
| Value *PTXString = createPTXKernelFunction(F); |
| Value *PTXEntry = getPTXKernelEntryName(F); |
| createCallGetPTXModule(PTXString, PtrCUModule); |
| LoadInst *CUModule = Builder.CreateLoad(PtrCUModule, "cumodule"); |
| createCallGetPTXKernelEntry(PTXEntry, CUModule, PtrCUKernel); |
| |
| // Allocate device memory and its corresponding host memory. |
| createCallAllocateMemoryForHostAndDevice(PtrHostData, PtrDevData, |
| getOutputArraySizeInBytes()); |
| |
| // Get the pointer to the device memory and set the GPU execution parameters. |
| LoadInst *DData = Builder.CreateLoad(PtrDevData, "device_data"); |
| LoadInst *CUKernel = Builder.CreateLoad(PtrCUKernel, "cukernel"); |
| createCallSetKernelParameters(CUKernel, getCUDABlockWidth(), |
| getCUDABlockHeight(), DData); |
| |
| // Create the start and end timer and record the start time. |
| createCallStartTimerByCudaEvent(PtrCUStartEvent, PtrCUStopEvent); |
| |
| // Launch the GPU kernel. |
| createCallLaunchKernel(CUKernel, getCUDAGridWidth(), getCUDAGridHeight()); |
| |
| // Copy the results back from the GPU to the host. |
| LoadInst *HData = Builder.CreateLoad(PtrHostData, "host_data"); |
| createCallCopyFromDeviceToHost(HData, DData, getOutputArraySizeInBytes()); |
| |
| // Record the end time. |
| LoadInst *CUStartEvent = Builder.CreateLoad(PtrCUStartEvent, "start_timer"); |
| LoadInst *CUStopEvent = Builder.CreateLoad(PtrCUStopEvent, "stop_timer"); |
| createCallStopTimerByCudaEvent(CUStartEvent, CUStopEvent, PtrElapsedTimes); |
| |
| // Cleanup all the resources used. |
| LoadInst *CUContext = Builder.CreateLoad(PtrCUContext, "cucontext"); |
| createCallCleanupGPGPUResources(HData, DData, CUModule, CUContext, CUKernel); |
| |
| // Erase the ptx kernel and device subfunctions and ptx intrinsics from |
| // current module. |
| eraseUnusedFunctions(F); |
| } |
| #endif /* GPU_CODEGEN */ |