[NVPTX] Improve kernel byval parameter lowering (#136008)
This change introduces a new pattern for lowering kernel byval
parameters in `NVPTXLowerArgs`. Each byval argument is wrapped in a call
to a new intrinsic, `@llvm.nvvm.internal.addrspace.wrap`. This intrinsic
explicitly equates to no instructions and is removed during operation
legalization in SDAG. However, it allows us to change the addrspace of
the arguments to 101 to reflect the fact that they will occupy this
space when lowered by `LowerFormalArgs` in `NVPTXISelLowering`.
Optionally, if a generic pointer to a param is needed, a standard
`addrspacecast` is used. This approach offers several advantages:
- Exposes addrspace optimizations: By using a standard `addrspacecast`
back to generic space we allow InferAS to optimize this instruction,
potentially sinking it through control flow or in other ways unsupported
by `NVPTXLowerArgs`. This is demonstrated in several existing tests.
- Clearer, more consistent semantics: Previously an `addrspacecast` from
generic to param space was implicitly a no-op. This is problematic
because it's not reciprocal with the inverse cast, violating LLVM
semantics. Further it is very confusing given the existence of
`cvta.to.param`. After this change the cast equates to this instruction.
- Allow for the removal of all nvvm.ptr.* intrinsics: In a follow-up
change the nvvm.ptr.gen.to.param and nvvm.ptr.param.to.gen intrinsics
may be removed.
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 9389504..d09e1da 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1909,6 +1909,22 @@
[IntrNoMem, IntrSpeculatable, IntrNoCallback],
"llvm.nvvm.ptr.param.to.gen">;
+// Represents an explicit hole in the LLVM IR type system. It may be inserted by
+// the compiler in cases where a pointer is of the wrong type. In the backend
+// this intrinsic will be folded away and not equate to any instruction. It
+// should not be used by any frontend and should only be considered well defined
+// when added in the following cases:
+//
+// - NVPTXLowerArgs: When wrapping a byval pointer argument to a kernel
+// function to convert the address space from generic (0) to param (101).
+// This accounts for the fact that the parameter symbols will occupy this
+// space when lowered during ISel.
+//
+def int_nvvm_internal_addrspace_wrap :
+ DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
+ [IntrNoMem, IntrSpeculatable, NoUndef<ArgIndex<0>>,
+ NoUndef<RetIndex>]>;
+
// Move intrinsics, used in nvvm internally
def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem],
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ec1f969..486c7c8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -985,6 +985,9 @@
case ADDRESS_SPACE_LOCAL:
Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
break;
+ case ADDRESS_SPACE_PARAM:
+ Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
+ break;
}
ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
return;
@@ -1008,7 +1011,7 @@
Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
break;
case ADDRESS_SPACE_PARAM:
- Opc = TM.is64Bit() ? NVPTX::IMOV64r : NVPTX::IMOV32r;
+ Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
break;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 277a341..49f4f30 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1014,6 +1014,8 @@
{MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
MVT::v32i32, MVT::v64i32, MVT::v128i32},
Custom);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -1426,6 +1428,17 @@
return MachinePointerInfo(ADDRESS_SPACE_LOCAL);
}
+
+ // Peel off an addrspacecast to generic and load directly from the specific
+ // address space.
+ if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
+ const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
+ if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
+ Ptr = ASC->getOperand(0);
+ return MachinePointerInfo(ASC->getSrcAddressSpace());
+ }
+ }
+
return MachinePointerInfo();
}
@@ -2746,6 +2759,15 @@
return Op;
}
+static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
+ switch (Op->getConstantOperandVal(0)) {
+ default:
+ return Op;
+ case Intrinsic::nvvm_internal_addrspace_wrap:
+ return Op.getOperand(1);
+ }
+}
+
// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
// Lower these into a node returning the correct type which is zero-extended
// back to the correct size.
@@ -2889,6 +2911,8 @@
return LowerGlobalAddress(Op, DAG);
case ISD::INTRINSIC_W_CHAIN:
return Op;
+ case ISD::INTRINSIC_WO_CHAIN:
+ return lowerIntrinsicWOChain(Op, DAG);
case ISD::INTRINSIC_VOID:
return LowerIntrinsicVoid(Op, DAG);
case ISD::BUILD_VECTOR:
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 9634ad8..4ba3e6f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2395,18 +2395,10 @@
"cvta.to." # Str # ".u64 \t$result, $src;", []>;
}
-defm cvta_local : NG_TO_G<"local">;
-defm cvta_shared : NG_TO_G<"shared">;
-defm cvta_global : NG_TO_G<"global">;
-defm cvta_const : NG_TO_G<"const">;
-
-defm cvta_to_local : G_TO_NG<"local">;
-defm cvta_to_shared : G_TO_NG<"shared">;
-defm cvta_to_global : G_TO_NG<"global">;
-defm cvta_to_const : G_TO_NG<"const">;
-
-// nvvm.ptr.param.to.gen
-defm cvta_param : NG_TO_G<"param">;
+foreach space = ["local", "shared", "global", "const", "param"] in {
+ defm cvta_#space : NG_TO_G<space>;
+ defm cvta_to_#space : G_TO_NG<space>;
+}
def : Pat<(int_nvvm_ptr_param_to_gen i32:$src),
(cvta_param $src)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index ee835db..bd396cd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -265,18 +265,9 @@
if (HasCvtaParam) {
auto GetParamAddrCastToGeneric =
[](Value *Addr, Instruction *OriginalUser) -> Value * {
- PointerType *ReturnTy =
- PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC);
- Function *CvtToGen = Intrinsic::getOrInsertDeclaration(
- OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen,
- {ReturnTy, PointerType::get(OriginalUser->getContext(),
- ADDRESS_SPACE_PARAM)});
-
- // Cast param address to generic address space
- Value *CvtToGenCall =
- CallInst::Create(CvtToGen, Addr, Addr->getName() + ".gen",
- OriginalUser->getIterator());
- return CvtToGenCall;
+ IRBuilder<> IRB(OriginalUser);
+ Type *GenTy = IRB.getPtrTy(ADDRESS_SPACE_GENERIC);
+ return IRB.CreateAddrSpaceCast(Addr, GenTy, Addr->getName() + ".gen");
};
auto *ParamInGenericAS =
GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
@@ -515,23 +506,24 @@
BasicBlock::iterator FirstInst = F.getEntryBlock().begin();
Type *StructType = Arg.getParamByValType();
const DataLayout &DL = F.getDataLayout();
- AllocaInst *AllocA = new AllocaInst(StructType, DL.getAllocaAddrSpace(),
- Arg.getName(), FirstInst);
+ IRBuilder<> IRB(&*FirstInst);
+ AllocaInst *AllocA = IRB.CreateAlloca(StructType, nullptr, Arg.getName());
// Set the alignment to alignment of the byval parameter. This is because,
// later load/stores assume that alignment, and we are going to replace
// the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(F.getParamAlign(Arg.getArgNo())
- .value_or(DL.getPrefTypeAlign(StructType)));
+ AllocA->setAlignment(
+ Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
Arg.replaceAllUsesWith(AllocA);
- Value *ArgInParam = new AddrSpaceCastInst(
- &Arg, PointerType::get(Arg.getContext(), ADDRESS_SPACE_PARAM),
- Arg.getName(), FirstInst);
+ Value *ArgInParam =
+ IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
+ &Arg, {}, Arg.getName());
+
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
// is definitely not volatile.
const auto ArgSize = *AllocA->getAllocationSize(DL);
- IRBuilder<> IRB(&*FirstInst);
IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),
ArgSize);
}
@@ -539,9 +531,9 @@
static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
Function *Func = Arg->getParent();
- bool HasCvtaParam =
- TM.getSubtargetImpl(*Func)->hasCvtaParam() && isKernelFunction(*Func);
- bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+ assert(isKernelFunction(*Func));
+ const bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+ const bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
const DataLayout &DL = Func->getDataLayout();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
@@ -556,9 +548,11 @@
// skip creation of a local copy of the argument.
SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));
- Value *ArgInParamAS = new AddrSpaceCastInst(
- Arg, PointerType::get(StructType->getContext(), ADDRESS_SPACE_PARAM),
- Arg->getName(), FirstInst);
+ IRBuilder<> IRB(&*FirstInst);
+ Value *ArgInParamAS = IRB.CreateIntrinsic(
+ Intrinsic::nvvm_internal_addrspace_wrap,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg});
+
for (Use *U : UsesToUpdate)
convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
@@ -576,30 +570,31 @@
// However, we're still not allowed to write to it. If the user specified
// `__grid_constant__` for the argument, we'll consider escaped pointer as
// read-only.
- if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) {
+ if (IsGridConstant || (HasCvtaParam && ArgUseIsReadOnly)) {
LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
// Replace all argument pointer uses (which might include a device function
// call) with a cast to the generic address space using cvta.param
// instruction, which avoids a local copy.
IRBuilder<> IRB(&Func->getEntryBlock().front());
- // Cast argument to param address space
- auto *CastToParam = cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
- Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));
+ // Cast argument to param address space. Because the backend will emit the
+ // argument already in the param address space, we need to use the no-op
+ // intrinsic. This has the added benefit of preventing other optimizations
+ // from folding away this pair of addrspacecasts.
+ auto *ParamSpaceArg =
+ IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()},
+ Arg, {}, Arg->getName() + ".param");
- // Cast param address to generic address space. We do not use an
- // addrspacecast to generic here, because, LLVM considers `Arg` to be in the
- // generic address space, and a `generic -> param` cast followed by a `param
- // -> generic` cast will be folded away. The `param -> generic` intrinsic
- // will be correctly lowered to `cvta.param`.
- Value *CvtToGenCall = IRB.CreateIntrinsic(
- IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen,
- CastToParam, nullptr, CastToParam->getName() + ".gen");
+ // Cast param address to generic address space.
+ Value *GenericArg = IRB.CreateAddrSpaceCast(
+ ParamSpaceArg, IRB.getPtrTy(ADDRESS_SPACE_GENERIC),
+ Arg->getName() + ".gen");
- Arg->replaceAllUsesWith(CvtToGenCall);
+ Arg->replaceAllUsesWith(GenericArg);
// Do not replace Arg in the cast to param space
- CastToParam->setOperand(0, Arg);
+ ParamSpaceArg->setOperand(0, Arg);
} else
copyByValParam(*Func, *Arg);
}
@@ -713,12 +708,14 @@
LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName()
<< "\n");
bool Changed = false;
- for (Argument &Arg : F.args())
- if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() &&
- !(isParamGridConstant(Arg) && isKernelFunction(F))) {
- copyByValParam(F, Arg);
- Changed = true;
- }
+ if (isKernelFunction(F)) {
+ for (Argument &Arg : F.args())
+ if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() &&
+ !isParamGridConstant(Arg)) {
+ copyByValParam(F, Arg);
+ Changed = true;
+ }
+ }
return Changed;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index b800445a..61b50b6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -16,11 +16,13 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Argument.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/Support/ModRef.h"
#include "llvm/Support/Mutex.h"
#include <cstdint>
#include <cstring>
@@ -228,17 +230,30 @@
return std::accumulate(V.begin(), V.end(), 1, std::multiplies<uint64_t>{});
}
-bool isParamGridConstant(const Value &V) {
- if (const Argument *Arg = dyn_cast<Argument>(&V)) {
- // "grid_constant" counts argument indices starting from 1
- if (Arg->hasByValAttr() &&
- argHasNVVMAnnotation(*Arg, "grid_constant",
- /*StartArgIndexAtOne*/ true)) {
- assert(isKernelFunction(*Arg->getParent()) &&
- "only kernel arguments can be grid_constant");
+bool isParamGridConstant(const Argument &Arg) {
+ assert(isKernelFunction(*Arg.getParent()) &&
+ "only kernel arguments can be grid_constant");
+
+ if (!Arg.hasByValAttr())
+ return false;
+
+ // Lowering an argument as a grid_constant violates the byval semantics (and
+ // the C++ API) by reusing the same memory location for the argument across
+ // multiple threads. If an argument is never written to and its address is not
+ // captured (its address is not compared with any value), then the tweak of
+ // the C++ API and byval semantics is unobservable by the program and we can
+ // lower the arg as a grid_constant.
+ if (Arg.onlyReadsMemory()) {
+ const auto CI = Arg.getAttributes().getCaptureInfo();
+ if (!capturesAddress(CI) && !capturesFullProvenance(CI))
return true;
- }
}
+
+ // "grid_constant" counts argument indices starting from 1
+ if (argHasNVVMAnnotation(Arg, "grid_constant",
+ /*StartArgIndexAtOne*/ true))
+ return true;
+
return false;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 2288241..70bf020 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -63,7 +63,7 @@
return F.getCallingConv() == CallingConv::PTX_Kernel;
}
-bool isParamGridConstant(const Value &);
+bool isParamGridConstant(const Argument &);
inline MaybeAlign getAlign(const Function &F, unsigned Index) {
return F.getAttributes().getAttributes(Index).getStackAlignment();
diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 76300e3..33c6dbd 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -12,7 +12,7 @@
entry:
; CHECK-LABEL: @_Z11TakesStruct1SPi
; PTX-LABEL: .visible .entry _Z11TakesStruct1SPi(
-; CHECK: addrspacecast ptr %input to ptr addrspace(101)
+; CHECK: call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %input)
%b = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
%0 = load i32, ptr %b, align 4
; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
index 6d9710e..80ae8aa 100644
--- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
+++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
@@ -65,7 +65,7 @@
; CHECK-LABEL: test_ld_param_byval(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_byval_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 836a7d7..dd172cf 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -72,7 +72,7 @@
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_int(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
@@ -101,7 +101,7 @@
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
@@ -122,16 +122,15 @@
; PTX-LABEL: grid_const_escape(
; PTX: {
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<5>;
+; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_escape_param_0;
-; PTX-NEXT: mov.b64 %rd3, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd4;
+; PTX-NEXT: st.param.b64 [param0], %rd3;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: call (retval0),
@@ -145,8 +144,8 @@
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
%call = call i32 @escape(ptr %input)
@@ -160,29 +159,27 @@
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<4>;
-; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %SPL, __local_depot4;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0;
-; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2;
-; PTX-NEXT: mov.b64 %rd4, %rd3;
; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
-; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
-; PTX-NEXT: mov.b64 %rd6, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd7, %rd6;
-; PTX-NEXT: add.u64 %rd8, %SP, 0;
-; PTX-NEXT: add.u64 %rd9, %SPL, 0;
-; PTX-NEXT: st.local.u32 [%rd9], %r1;
+; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2;
+; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
+; PTX-NEXT: add.u64 %rd6, %SP, 0;
+; PTX-NEXT: add.u64 %rd7, %SPL, 0;
+; PTX-NEXT: st.local.u32 [%rd7], %r1;
; PTX-NEXT: mov.b64 %rd1, escape3;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd7;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 param1;
-; PTX-NEXT: st.param.b64 [param1], %rd8;
+; PTX-NEXT: st.param.b64 [param1], %rd6;
; PTX-NEXT: .param .b64 param2;
-; PTX-NEXT: st.param.b64 [param2], %rd5;
+; PTX-NEXT: st.param.b64 [param2], %rd4;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
; PTX-NEXT: call (retval0),
@@ -198,10 +195,10 @@
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
+; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
+; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
@@ -215,20 +212,19 @@
define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
; PTX-LABEL: grid_const_memory_escape(
; PTX: {
-; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0;
; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
-; PTX-NEXT: mov.b64 %rd4, %rd1;
-; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
-; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: cvta.param.u64 %rd4, %rd1;
+; PTX-NEXT: st.global.u64 [%rd3], %rd4;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
; OPT-NEXT: ret void
store ptr %input, ptr %addr, align 8
@@ -238,14 +234,13 @@
define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
; PTX: {
-; PTX-NEXT: .reg .b64 %rd<8>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5;
-; PTX-NEXT: mov.b64 %rd7, %rd4;
-; PTX-NEXT: cvta.param.u64 %rd2, %rd7;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd4;
; PTX-NEXT: add.s64 %rd3, %rd2, 4;
; PTX-NEXT: // begin inline asm
; PTX-NEXT: add.s64 %rd1, %rd2, %rd3;
@@ -255,8 +250,8 @@
; PTX-NOT .local
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
@@ -273,21 +268,20 @@
; PTX-LABEL: grid_const_partial_escape(
; PTX: {
; PTX-NEXT: .reg .b32 %r<5>;
-; PTX-NEXT: .reg .b64 %rd<7>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escape_param_0;
; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; PTX-NEXT: mov.b64 %rd5, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX-NEXT: ld.u32 %r1, [%rd6];
+; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escape_param_0];
; PTX-NEXT: add.s32 %r2, %r1, %r1;
; PTX-NEXT: st.global.u32 [%rd4], %r2;
; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 2, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: call (retval0),
@@ -301,8 +295,8 @@
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT]], align 4
@@ -319,22 +313,21 @@
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
; PTX-NEXT: .reg .b32 %r<6>;
-; PTX-NEXT: .reg .b64 %rd<7>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escapemem_param_0;
; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; PTX-NEXT: mov.b64 %rd5, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX-NEXT: ld.u32 %r1, [%rd6];
-; PTX-NEXT: ld.u32 %r2, [%rd6+4];
-; PTX-NEXT: st.global.u64 [%rd4], %rd6;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escapemem_param_0];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_partial_escapemem_param_0+4];
+; PTX-NEXT: st.global.u64 [%rd4], %rd5;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 3, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: call (retval0),
@@ -349,8 +342,8 @@
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
@@ -374,27 +367,25 @@
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0;
-; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1];
-; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
-; PTX-NEXT: mov.b64 %rd7, %rd5;
-; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: mov.b64 %rd6, grid_const_phi_param_0;
+; PTX-NEXT: ld.param.u64 %rd5, [grid_const_phi_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd5;
; PTX-NEXT: ld.global.u32 %r1, [%rd1];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT: @%p1 bra $L__BB9_2;
; PTX-NEXT: // %bb.1: // %second
-; PTX-NEXT: add.s64 %rd8, %rd8, 4;
+; PTX-NEXT: add.s64 %rd6, %rd6, 4;
; PTX-NEXT: $L__BB9_2: // %merge
-; PTX-NEXT: ld.u32 %r2, [%rd8];
+; PTX-NEXT: ld.param.u32 %r2, [%rd6];
; PTX-NEXT: st.global.u32 [%rd1], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
@@ -432,32 +423,28 @@
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<12>;
+; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0;
-; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
-; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7;
-; PTX-NEXT: mov.b64 %rd10, %rd6;
-; PTX-NEXT: cvta.param.u64 %rd11, %rd10;
+; PTX-NEXT: mov.b64 %rd7, grid_const_phi_ngc_param_0;
+; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_ngc_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
; PTX-NEXT: ld.global.u32 %r1, [%rd1];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT: @%p1 bra $L__BB10_2;
; PTX-NEXT: // %bb.1: // %second
-; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1;
-; PTX-NEXT: mov.b64 %rd9, %rd8;
-; PTX-NEXT: cvta.param.u64 %rd2, %rd9;
-; PTX-NEXT: add.s64 %rd11, %rd2, 4;
+; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1;
+; PTX-NEXT: add.s64 %rd7, %rd2, 4;
; PTX-NEXT: $L__BB10_2: // %merge
-; PTX-NEXT: ld.u32 %r2, [%rd11];
+; PTX-NEXT: ld.param.u32 %r2, [%rd7];
; PTX-NEXT: st.global.u32 [%rd1], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
+; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
@@ -494,29 +481,25 @@
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0;
; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1;
-; PTX-NEXT: mov.b64 %rd5, %rd4;
-; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX-NEXT: mov.b64 %rd7, %rd1;
-; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
; PTX-NEXT: ld.global.u32 %r1, [%rd3];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
-; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
-; PTX-NEXT: ld.u32 %r2, [%rd9];
+; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX-NEXT: ld.param.u32 %r2, [%rd5];
; PTX-NEXT: st.global.u32 [%rd3], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_select(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
+; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
@@ -535,22 +518,21 @@
; PTX-LABEL: grid_const_ptrtoint(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
-; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0;
-; PTX-NEXT: mov.b64 %rd2, %rd1;
; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
-; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
-; PTX-NEXT: cvt.u32.u64 %r2, %rd3;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd1;
+; PTX-NEXT: cvt.u32.u64 %r2, %rd2;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr
; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
; OPT-NEXT: ret i32 [[KEEPALIVE]]
@@ -560,13 +542,42 @@
ret i32 %keepalive
}
+declare void @device_func(ptr byval(i32) align 4)
+
+define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
+; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
+; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: test_forward_byval_arg(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0];
+; PTX-NEXT: { // callseq 4, 0
+; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: st.param.b32 [param0], %r1;
+; PTX-NEXT: call.uni
+; PTX-NEXT: device_func,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 4
+; PTX-NEXT: ret;
+ call void @device_func(ptr byval(i32) align 4 %input)
+ ret void
+}
declare dso_local void @dummy() local_unnamed_addr
declare dso_local ptr @escape(ptr) local_unnamed_addr
declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24}
!0 = !{ptr @grid_const_int, !"grid_constant", !1}
!1 = !{i32 1}
@@ -604,4 +615,6 @@
!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
!23 = !{i32 1}
+!24 = !{ptr @test_forward_byval_arg, !"grid_constant", !25}
+!25 = !{i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 8fa7d5c..8e87987 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -210,7 +210,7 @@
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRC-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
@@ -220,7 +220,7 @@
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRO-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 2d8684c..1304ffe 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -32,7 +32,7 @@
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT: ret void
@@ -40,10 +40,7 @@
; COPY-LABEL: define dso_local ptx_kernel void @read_only(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S1]], align 4
+; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S]], align 4
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
@@ -69,7 +66,7 @@
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
@@ -78,10 +75,7 @@
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4
; COPY-NEXT: [[I:%.*]] = load i32, ptr [[B]], align 4
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
@@ -104,122 +98,51 @@
ret void
}
-; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
-; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
-; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
-; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
-; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; COPY-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[ASC]], align 4
-; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
-; COPY-NEXT: ret void
-;
-; PTX-LABEL: read_only_gep_asc(
-; PTX: {
-; PTX-NEXT: .reg .b32 %r<2>;
-; PTX-NEXT: .reg .b64 %rd<3>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc_param_0];
-; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
-; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc_param_1+4];
-; PTX-NEXT: st.global.u32 [%rd2], %r1;
-; PTX-NEXT: ret;
-entry:
- %b = getelementptr inbounds nuw i8, ptr %s, i64 4
- %asc = addrspacecast ptr %b to ptr addrspace(101)
- %i = load i32, ptr addrspace(101) %asc, align 4
- store i32 %i, ptr %out, align 4
- ret void
-}
+;; TODO: This test has been disabled because the addrspacecast is not legal on
+;; sm_60, and not well supported within nvptx-lower-args. We should determine
+;; in what cases it is safe to make assumptions about the address of a byval
+;; parameter and improve our handling of addrspacecast in nvptx-lower-args.
-; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; COMMON-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
-; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[ENTRY:.*:]]
-; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
-; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
-; COMMON-NEXT: store i32 [[I]], ptr [[OUT]], align 4
-; COMMON-NEXT: ret void
+; define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; entry:
+; %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+; %asc = addrspacecast ptr %b to ptr addrspace(101)
+; %i = load i32, ptr addrspace(101) %asc, align 4
+; store i32 %i, ptr %out, align 4
+; ret void
+; }
;
-; PTX-LABEL: read_only_gep_asc0(
-; PTX: {
-; PTX-NEXT: .reg .b32 %r<2>;
-; PTX-NEXT: .reg .b64 %rd<3>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc0_param_0];
-; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
-; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4];
-; PTX-NEXT: st.global.u32 [%rd2], %r1;
-; PTX-NEXT: ret;
-entry:
- %b = getelementptr inbounds nuw i8, ptr %s, i64 4
- %asc = addrspacecast ptr %b to ptr addrspace(101)
- %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
- %i = load i32, ptr %asc0, align 4
- store i32 %i, ptr %out, align 4
- ret void
-}
+; define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; entry:
+; %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+; %asc = addrspacecast ptr %b to ptr addrspace(101)
+; %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
+; %i = load i32, ptr %asc0, align 4
+; store i32 %i, ptr %out, align 4
+; ret void
+; }
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]]
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
-; SM_70-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]]
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr(
+; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
+; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptr(
; PTX: {
-; PTX-NEXT: .local .align 4 .b8 __local_depot4[8];
+; PTX-NEXT: .local .align 4 .b8 __local_depot2[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: mov.b64 %SPL, __local_depot4;
+; PTX-NEXT: mov.b64 %SPL, __local_depot2;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: add.u64 %rd1, %SP, 0;
; PTX-NEXT: add.u64 %rd2, %SPL, 0;
@@ -244,46 +167,26 @@
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]]
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
-; SM_70-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]]
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
+; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
+; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptr_gep(
; PTX: {
-; PTX-NEXT: .local .align 4 .b8 __local_depot5[8];
+; PTX-NEXT: .local .align 4 .b8 __local_depot3[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: mov.b64 %SPL, __local_depot5;
+; PTX-NEXT: mov.b64 %SPL, __local_depot3;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: add.u64 %rd1, %SP, 0;
; PTX-NEXT: add.u64 %rd2, %SPL, 0;
@@ -314,21 +217,21 @@
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptr_store(
; PTX: {
-; PTX-NEXT: .local .align 4 .b8 __local_depot6[8];
+; PTX-NEXT: .local .align 4 .b8 __local_depot4[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: mov.b64 %SPL, __local_depot6;
+; PTX-NEXT: mov.b64 %SPL, __local_depot4;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0];
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
@@ -351,7 +254,7 @@
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8
@@ -359,14 +262,14 @@
;
; PTX-LABEL: escape_ptr_gep_store(
; PTX: {
-; PTX-NEXT: .local .align 4 .b8 __local_depot7[8];
+; PTX-NEXT: .local .align 4 .b8 __local_depot5[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: mov.b64 %SPL, __local_depot7;
+; PTX-NEXT: mov.b64 %SPL, __local_depot5;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_gep_store_param_0];
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
@@ -391,7 +294,7 @@
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64
; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8
@@ -399,14 +302,14 @@
;
; PTX-LABEL: escape_ptrtoint(
; PTX: {
-; PTX-NEXT: .local .align 4 .b8 __local_depot8[8];
+; PTX-NEXT: .local .align 4 .b8 __local_depot6[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: mov.b64 %SPL, __local_depot8;
+; PTX-NEXT: mov.b64 %SPL, __local_depot6;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0];
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
@@ -429,17 +332,14 @@
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true)
+; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true)
; COPY-NEXT: ret void
;
; PTX-LABEL: memcpy_from_param(
@@ -492,17 +392,14 @@
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 8
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[S1]], ptr addrspace(101) align 8 [[S2]], i64 8, i1 false)
-; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true)
+; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true)
; COPY-NEXT: ret void
;
; PTX-LABEL: memcpy_from_param_noalign(
@@ -551,26 +448,26 @@
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
-; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
; COMMON-NEXT: ret void
;
; PTX-LABEL: memcpy_to_param(
; PTX: {
-; PTX-NEXT: .local .align 8 .b8 __local_depot11[8];
+; PTX-NEXT: .local .align 8 .b8 __local_depot9[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<48>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
-; PTX-NEXT: mov.b64 %SPL, __local_depot11;
+; PTX-NEXT: mov.b64 %SPL, __local_depot9;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [memcpy_to_param_param_0];
; PTX-NEXT: add.u64 %rd3, %SPL, 0;
@@ -636,7 +533,7 @@
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4
; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4
@@ -659,10 +556,10 @@
; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_60-NEXT: [[BB:.*:]]
; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -672,10 +569,10 @@
; SM_70-LABEL: define ptx_kernel void @test_select(
; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
+; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
@@ -685,10 +582,10 @@
; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; COPY-NEXT: [[BB:.*:]]
; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -719,7 +616,7 @@
; PTX_70-NEXT: .reg .pred %p<2>;
; PTX_70-NEXT: .reg .b16 %rs<3>;
; PTX_70-NEXT: .reg .b32 %r<2>;
-; PTX_70-NEXT: .reg .b64 %rd<10>;
+; PTX_70-NEXT: .reg .b64 %rd<6>;
; PTX_70-EMPTY:
; PTX_70-NEXT: // %bb.0: // %bb
; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3];
@@ -729,12 +626,8 @@
; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2];
; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1;
-; PTX_70-NEXT: mov.b64 %rd5, %rd4;
-; PTX_70-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX_70-NEXT: mov.b64 %rd7, %rd1;
-; PTX_70-NEXT: cvta.param.u64 %rd8, %rd7;
-; PTX_70-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
-; PTX_70-NEXT: ld.u32 %r1, [%rd9];
+; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX_70-NEXT: ld.param.u32 %r1, [%rd5];
; PTX_70-NEXT: st.global.u32 [%rd3], %r1;
; PTX_70-NEXT: ret;
bb:
@@ -749,10 +642,10 @@
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
@@ -760,7 +653,7 @@
;
; PTX-LABEL: test_select_write(
; PTX: {
-; PTX-NEXT: .local .align 4 .b8 __local_depot14[8];
+; PTX-NEXT: .local .align 4 .b8 __local_depot12[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .pred %p<2>;
@@ -769,7 +662,7 @@
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %bb
-; PTX-NEXT: mov.b64 %SPL, __local_depot14;
+; PTX-NEXT: mov.b64 %SPL, __local_depot12;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u8 %rs1, [test_select_write_param_3];
; PTX-NEXT: and.b16 %rs2, %rs1, 1;
@@ -795,10 +688,10 @@
; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_60-NEXT: [[BB:.*:]]
; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_60: [[FIRST]]:
@@ -816,10 +709,10 @@
; SM_70-LABEL: define ptx_kernel void @test_phi(
; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
+; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_70: [[FIRST]]:
; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
@@ -837,10 +730,10 @@
; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; COPY-NEXT: [[BB:.*:]]
; COPY-NEXT: [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8
-; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COPY: [[FIRST]]:
@@ -869,10 +762,10 @@
; PTX_60-NEXT: ld.param.u64 %rd2, [test_phi_param_2];
; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2;
; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_0];
-; PTX_60-NEXT: @%p1 bra $L__BB15_2;
+; PTX_60-NEXT: @%p1 bra $L__BB13_2;
; PTX_60-NEXT: // %bb.1: // %second
; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_1+4];
-; PTX_60-NEXT: $L__BB15_2: // %merge
+; PTX_60-NEXT: $L__BB13_2: // %merge
; PTX_60-NEXT: st.global.u32 [%rd1], %r4;
; PTX_60-NEXT: ret;
;
@@ -881,25 +774,21 @@
; PTX_70-NEXT: .reg .pred %p<2>;
; PTX_70-NEXT: .reg .b16 %rs<3>;
; PTX_70-NEXT: .reg .b32 %r<2>;
-; PTX_70-NEXT: .reg .b64 %rd<12>;
+; PTX_70-NEXT: .reg .b64 %rd<8>;
; PTX_70-EMPTY:
; PTX_70-NEXT: // %bb.0: // %bb
; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3];
; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; PTX_70-NEXT: mov.b64 %rd6, test_phi_param_0;
-; PTX_70-NEXT: ld.param.u64 %rd7, [test_phi_param_2];
-; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd7;
-; PTX_70-NEXT: mov.b64 %rd10, %rd6;
-; PTX_70-NEXT: cvta.param.u64 %rd11, %rd10;
-; PTX_70-NEXT: @%p1 bra $L__BB15_2;
+; PTX_70-NEXT: mov.b64 %rd7, test_phi_param_0;
+; PTX_70-NEXT: ld.param.u64 %rd6, [test_phi_param_2];
+; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd6;
+; PTX_70-NEXT: @%p1 bra $L__BB13_2;
; PTX_70-NEXT: // %bb.1: // %second
-; PTX_70-NEXT: mov.b64 %rd8, test_phi_param_1;
-; PTX_70-NEXT: mov.b64 %rd9, %rd8;
-; PTX_70-NEXT: cvta.param.u64 %rd2, %rd9;
-; PTX_70-NEXT: add.s64 %rd11, %rd2, 4;
-; PTX_70-NEXT: $L__BB15_2: // %merge
-; PTX_70-NEXT: ld.u32 %r1, [%rd11];
+; PTX_70-NEXT: mov.b64 %rd2, test_phi_param_1;
+; PTX_70-NEXT: add.s64 %rd7, %rd2, 4;
+; PTX_70-NEXT: $L__BB13_2: // %merge
+; PTX_70-NEXT: ld.param.u32 %r1, [%rd7];
; PTX_70-NEXT: st.global.u32 [%rd1], %r1;
; PTX_70-NEXT: ret;
bb:
@@ -925,10 +814,10 @@
; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COMMON: [[FIRST]]:
@@ -944,7 +833,7 @@
;
; PTX-LABEL: test_phi_write(
; PTX: {
-; PTX-NEXT: .local .align 4 .b8 __local_depot16[8];
+; PTX-NEXT: .local .align 4 .b8 __local_depot14[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .pred %p<2>;
@@ -953,7 +842,7 @@
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %bb
-; PTX-NEXT: mov.b64 %SPL, __local_depot16;
+; PTX-NEXT: mov.b64 %SPL, __local_depot14;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u8 %rs1, [test_phi_write_param_2];
; PTX-NEXT: and.b16 %rs2, %rs1, 1;
@@ -964,10 +853,10 @@
; PTX-NEXT: add.u64 %rd6, %SPL, 4;
; PTX-NEXT: ld.param.u32 %r2, [test_phi_write_param_0];
; PTX-NEXT: st.u32 [%SP+4], %r2;
-; PTX-NEXT: @%p1 bra $L__BB16_2;
+; PTX-NEXT: @%p1 bra $L__BB14_2;
; PTX-NEXT: // %bb.1: // %second
; PTX-NEXT: mov.b64 %rd6, %rd1;
-; PTX-NEXT: $L__BB16_2: // %merge
+; PTX-NEXT: $L__BB14_2: // %merge
; PTX-NEXT: mov.b32 %r3, 1;
; PTX-NEXT: st.local.u32 [%rd6], %r3;
; PTX-NEXT: ret;
@@ -988,6 +877,69 @@
ret void
}
+define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
+; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
+; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
+; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
+; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
+; COMMON-NEXT: ret void
+;
+; PTX-LABEL: test_forward_byval_arg(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot15[4];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %SPL, __local_depot15;
+; PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0];
+; PTX-NEXT: st.local.u32 [%rd2], %r1;
+; PTX-NEXT: { // callseq 2, 0
+; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: st.param.b32 [param0], %r1;
+; PTX-NEXT: call.uni
+; PTX-NEXT: device_func,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 2
+; PTX-NEXT: ret;
+ call void @device_func(ptr byval(i32) align 4 %input)
+ ret void
+}
+
+define void @device_func(ptr byval(i32) align 4 %input) {
+; COMMON-LABEL: define void @device_func(
+; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
+; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT]])
+; COMMON-NEXT: ret void
+;
+; PTX-LABEL: device_func(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u32 %r1, [device_func_param_0];
+; PTX-NEXT: { // callseq 3, 0
+; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: st.param.b32 [param0], %r1;
+; PTX-NEXT: call.uni
+; PTX-NEXT: device_func,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 3
+; PTX-NEXT: ret;
+ call void @device_func(ptr byval(i32) align 4 %input)
+ ret void
+}
+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
index e470569..ad0b11e 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
@@ -6,7 +6,7 @@
define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: caller_St8x4(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
@@ -27,11 +27,11 @@
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0];
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16];
; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: ld.param.u32 %r3, [caller_St8x4_param_1];
-; CHECK-NEXT: st.u64 [%r3], %rd5;
-; CHECK-NEXT: st.u64 [%r3+8], %rd6;
-; CHECK-NEXT: st.u64 [%r3+16], %rd7;
-; CHECK-NEXT: st.u64 [%r3+24], %rd8;
+; CHECK-NEXT: ld.param.u32 %r2, [caller_St8x4_param_1];
+; CHECK-NEXT: st.u64 [%r2], %rd5;
+; CHECK-NEXT: st.u64 [%r2+8], %rd6;
+; CHECK-NEXT: st.u64 [%r2+16], %rd7;
+; CHECK-NEXT: st.u64 [%r2+24], %rd8;
; CHECK-NEXT: ret;
%call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2
%.fca.0.extract = extractvalue [4 x i64] %call, 0