[AMDGPU] Implement addrspacecast from flat <-> private on gfx1250 (#152218)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1fdf272..a6e4a63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2271,6 +2271,9 @@
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !ST.hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// FIXME: It would be more natural to emit a COPY here, but then copy
// coalescing would kick in and it would think it's okay to use the "HI"
// subregister (instead of extracting the HI 32 bits) which is an artificial
@@ -2396,11 +2399,30 @@
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+ auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register { // Builds the flat -> local/private conversion of Src into Dst and returns the result register.
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: the private pointer
+ // is the low 32 bits of the flat pointer minus src_flat_scratch_base_lo.
+ const LLT S32 = LLT::scalar(32);
+ Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
+ Register FlatScratchBaseLo =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
+ Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
+ return B.buildIntToPtr(Dst, Sub).getReg(0);
+ }
+
+ // All other cases: the result is simply the low 32 bits of the flat pointer.
+ return B.buildExtract(Dst, Src, 0).getReg(0);
+ };
+
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
- // Extract low 32-bits of the pointer.
- B.buildExtract(Dst, Src, 0);
+ castFlatToLocalOrPrivate(Dst);
MI.eraseFromParent();
return true;
}
@@ -2411,7 +2433,7 @@
auto FlatNull = B.buildConstant(SrcTy, 0);
// Extract low 32-bits of the pointer.
- auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
+ auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
auto CmpRes =
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
@@ -2425,14 +2447,45 @@
(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
- Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
- if (!ApertureReg.isValid())
- return false;
-
// Coerce the type of the low half of the result so we can use
// merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ Register AllOnes = B.buildConstant(S32, -1).getReg(0);
+ Register ThreadID = B.buildConstant(S32, 0).getReg(0);
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ if (ST.isWave64()) {
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ }
+ Register ShAmt =
+ B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
+ Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
+ Register CvtPtr =
+ B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ Register FlatScratchBase =
+ B.buildInstr(AMDGPU::S_MOV_B64, {S64},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
+ return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
+ }
+
+ Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+ if (!ApertureReg.isValid())
+ return false;
+
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -5788,11 +5841,25 @@
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
- Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ const LLT S32 = LLT::scalar(32);
+ auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
Register Hi32 = Unmerge.getReg(1);
- B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ Register FlatScratchBaseHi =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
+ // Test bits 63..58 against the aperture address.
+ Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
+ B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
+ B.buildConstant(S32, 1u << 26));
+ } else {
+ Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
+ B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ }
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d67e4a..63826b7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2098,10 +2098,17 @@
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- // Flat -> private/local is a simple truncate.
- // Flat -> global is no-op
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // Flat -> private requires subtracting src_flat_scratch_base_lo.
+ return false;
+ }
+
+ // Flat -> private/local is a simple truncate.
+ // Flat -> global is no-op
return true;
+ }
const GCNTargetMachine &TM =
static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !Subtarget->hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// Note: this feature (register) is broken. When used as a 32-bit operand,
// it returns a wrong value (all zeroes?). The real value is in the upper 32
// bits.
@@ -7760,6 +7770,18 @@
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: subtract
+ // src_flat_scratch_base_lo.
+ SDValue FlatScratchBaseLo(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, SL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
+ 0);
+ Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+ }
+
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return Ptr;
@@ -7776,11 +7798,40 @@
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
-
- SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
- SDValue CvtPtr =
- DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
- CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ SDValue CvtPtr;
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
+ SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
+ AllOnes, ThreadID);
+ if (Subtarget->isWave64())
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
+ AllOnes, ThreadID);
+ SDValue ShAmt = DAG.getShiftAmountConstant(
+ 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
+ SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ SDValue FlatScratchBase = {
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B64, SL, MVT::i64,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
+ 0};
+ CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
+ } else {
+ SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ }
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return CvtPtr;
@@ -9424,15 +9475,29 @@
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
SDLoc SL(Op);
+ SDValue SrcVec =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+ SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
+ DAG.getConstant(1, SL, MVT::i32));
+
unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
? AMDGPUAS::LOCAL_ADDRESS
: AMDGPUAS::PRIVATE_ADDRESS;
- SDValue Aperture = getSegmentAperture(AS, SL, DAG);
- SDValue SrcVec =
- DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ SDValue FlatScratchBaseHi(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
+ 0);
+ // Test bits 63..58 against the aperture address.
+ return DAG.getSetCC(
+ SL, MVT::i1,
+ DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
+ DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
+ }
- SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
- DAG.getConstant(1, SL, MVT::i32));
+ SDValue Aperture = getSegmentAperture(AS, SL, DAG);
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
case Intrinsic::amdgcn_perm:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ed6b973..81655f5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -866,7 +866,8 @@
def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE,
- SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
+ SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA,
+ SRC_FLAT_SCRATCH_BASE)> {
let CopyCost = 1;
let AllocationPriority = 1;
let HasSGPR = 1;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
index 6a4522f..d69a3e1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
@@ -141,11 +141,11 @@
; SIVI-NEXT: {{ $}}
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr0
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5)
; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -157,9 +157,9 @@
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -210,11 +210,11 @@
; SIVI-NEXT: {{ $}}
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3)
; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -226,9 +226,9 @@
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -354,20 +354,20 @@
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
; SIVI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>)
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; SIVI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]]
; SIVI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
+ ; SIVI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; SIVI-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY3]], [[C]](s64)
; SIVI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; SIVI-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32)
; SIVI-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]]
; SIVI-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C2]]
@@ -379,17 +379,17 @@
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>)
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+ ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64)
- ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32)
; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]]
; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]]
@@ -506,19 +506,19 @@
; SIVI-NEXT: {{ $}}
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0)
;
; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0
; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0)
%0:_(p5) = G_FRAME_INDEX %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
new file mode 100644
index 0000000..4b6375c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+; Test code sequences for addrspacecast with globally addressable scratch.
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1
+; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
+; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspace(5) %ptr) {
+; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast_nonnull:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 20, v2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
+; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v3 scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %stof = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %ptr)
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) {
+; GFX1250-LABEL: use_flat_to_private_addrspacecast:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-NEXT: s_cselect_b32 s0, s2, -1
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
+ %ftos = addrspacecast ptr %ptr to ptr addrspace(5)
+ store volatile i32 0, ptr addrspace(5) %ftos
+ ret void
+}
+
+define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
+; GFX1250-SDAG-LABEL: use_flat_to_private_addrspacecast_nonnull:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: use_flat_to_private_addrspacecast_nonnull:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %ftos = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr %ptr)
+ store volatile i32 0, ptr addrspace(5) %ftos
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 2ff66c9..7d36c9f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -252,13 +252,15 @@
; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -277,9 +279,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -292,15 +296,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -314,13 +319,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -344,11 +352,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -367,8 +377,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -381,18 +394,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -406,13 +420,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -433,11 +450,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -455,9 +474,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -465,13 +486,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -483,14 +505,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -508,10 +533,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -529,8 +556,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -538,16 +568,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -559,14 +590,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -642,13 +676,15 @@
; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -667,9 +703,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -683,15 +721,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -705,13 +744,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -736,11 +778,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -759,8 +803,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -774,18 +821,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -799,13 +847,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -827,11 +878,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -849,9 +902,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -862,13 +917,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -880,14 +936,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -908,10 +967,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -929,8 +990,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -941,16 +1005,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -962,14 +1027,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -1048,13 +1116,15 @@
; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1073,9 +1143,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -1089,15 +1161,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1111,13 +1184,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -1142,11 +1218,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1165,8 +1243,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -1180,18 +1261,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1205,13 +1287,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -1233,11 +1318,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1255,9 +1342,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -1268,13 +1357,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1286,14 +1376,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -1314,10 +1407,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1335,8 +1430,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -1347,16 +1445,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1368,14 +1467,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -1454,13 +1556,15 @@
; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1479,9 +1583,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3
@@ -1496,15 +1602,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1518,13 +1625,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4
@@ -1550,11 +1660,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1573,8 +1685,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3
@@ -1589,18 +1704,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1614,13 +1730,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4
@@ -1643,11 +1762,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1665,9 +1786,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3
@@ -1679,13 +1802,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1697,14 +1821,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4
@@ -1726,10 +1853,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1747,8 +1876,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3
@@ -1760,16 +1892,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1781,14 +1914,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4
@@ -1868,13 +2004,15 @@
; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1893,9 +2031,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3
@@ -1910,15 +2050,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1932,13 +2073,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4
@@ -1964,11 +2108,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1987,8 +2133,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3
@@ -2003,18 +2152,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2028,13 +2178,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4
@@ -2057,11 +2210,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2079,9 +2234,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
@@ -2093,13 +2250,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2111,14 +2269,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4
@@ -2140,10 +2301,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2161,8 +2324,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
@@ -2174,16 +2340,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2195,14 +2362,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4
@@ -2282,13 +2452,15 @@
; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2307,9 +2479,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3
@@ -2324,15 +2498,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2346,13 +2521,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4
@@ -2378,11 +2556,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2401,8 +2581,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3
@@ -2417,18 +2600,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2442,13 +2626,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4
@@ -2471,11 +2658,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2493,9 +2682,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
@@ -2507,13 +2698,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2525,14 +2717,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
@@ -2554,10 +2749,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2575,8 +2772,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
@@ -2588,16 +2788,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2609,14 +2810,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
@@ -2690,13 +2894,15 @@
; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2715,10 +2921,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3]
@@ -2732,15 +2940,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2753,15 +2962,18 @@
; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5]
@@ -2786,11 +2998,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2809,9 +3023,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3]
@@ -2825,18 +3042,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2849,15 +3067,18 @@
; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5]
@@ -2879,11 +3100,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2900,9 +3123,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3]
@@ -2913,13 +3138,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2930,14 +3156,17 @@
; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5]
@@ -2958,10 +3187,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2978,8 +3209,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3]
@@ -2990,16 +3224,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3010,14 +3245,17 @@
; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5]
@@ -3090,13 +3328,15 @@
; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3115,10 +3355,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3]
@@ -3132,15 +3374,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3153,15 +3396,18 @@
; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5]
@@ -3186,11 +3432,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3209,9 +3457,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3]
@@ -3225,18 +3476,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3249,15 +3501,18 @@
; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5]
@@ -3279,11 +3534,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3300,9 +3557,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3]
@@ -3313,13 +3572,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3330,14 +3590,17 @@
; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5]
@@ -3358,10 +3621,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3378,8 +3643,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3]
@@ -3390,16 +3658,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3410,14 +3679,17 @@
; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5]
@@ -3490,13 +3762,15 @@
; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3515,10 +3789,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3]
@@ -3532,15 +3808,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3553,15 +3830,18 @@
; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5]
@@ -3586,11 +3866,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3609,9 +3891,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3]
@@ -3625,18 +3910,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3649,15 +3935,18 @@
; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5]
@@ -3679,11 +3968,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3700,9 +3991,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3]
@@ -3713,13 +4006,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3730,14 +4024,17 @@
; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5]
@@ -3758,10 +4055,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3778,8 +4077,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3]
@@ -3790,16 +4092,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3810,14 +4113,17 @@
; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5]
@@ -3890,13 +4196,15 @@
; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3915,10 +4223,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3]
@@ -3932,15 +4242,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3953,15 +4264,18 @@
; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[2:3], v[0:1], v[4:5]
@@ -3986,11 +4300,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4009,9 +4325,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3]
@@ -4025,18 +4344,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4049,15 +4369,18 @@
; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[2:3], v[0:1], v[4:5]
@@ -4079,11 +4402,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4100,9 +4425,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3]
@@ -4113,13 +4440,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4130,14 +4458,17 @@
; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5]
@@ -4158,10 +4489,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4178,8 +4511,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3]
@@ -4190,16 +4526,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4210,14 +4547,17 @@
; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5]
@@ -4310,14 +4650,16 @@
; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4338,9 +4680,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4356,15 +4700,16 @@
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4380,13 +4725,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4414,11 +4762,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4439,8 +4789,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4456,18 +4809,19 @@
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4483,13 +4837,16 @@
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4512,13 +4869,15 @@
; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4538,9 +4897,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4553,13 +4914,14 @@
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4573,14 +4935,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4603,10 +4968,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4626,8 +4993,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4640,16 +5010,17 @@
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4663,14 +5034,17 @@
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4742,13 +5116,15 @@
; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4766,15 +5142,16 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -4786,15 +5163,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4806,21 +5184,24 @@
; GFX1250-GISEL-NEXT: s_branch .LBB98_5
; GFX1250-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -4843,11 +5224,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4865,14 +5248,16 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -4884,18 +5269,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4907,21 +5293,24 @@
; GFX1250-GISEL-NEXT: s_branch .LBB99_5
; GFX1250-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -4941,11 +5330,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4961,14 +5352,15 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4977,13 +5369,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4993,20 +5386,23 @@
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5025,10 +5421,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5044,13 +5442,15 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5059,16 +5459,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5078,20 +5479,23 @@
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5161,13 +5565,15 @@
; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5185,10 +5591,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5207,15 +5615,16 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5227,15 +5636,18 @@
; GFX1250-GISEL-NEXT: s_branch .LBB106_5
; GFX1250-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5265,11 +5677,13 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5287,9 +5701,12 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5308,18 +5725,19 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5331,15 +5749,18 @@
; GFX1250-GISEL-NEXT: s_branch .LBB107_5
; GFX1250-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5366,11 +5787,13 @@
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5386,9 +5809,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5404,13 +5829,14 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5420,14 +5846,17 @@
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5453,10 +5882,12 @@
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5472,8 +5903,11 @@
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5489,16 +5923,17 @@
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5508,14 +5943,17 @@
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index 735720a..725d57d 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -285,7 +285,7 @@
; GCN-LABEL: flat_store_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
@@ -298,7 +298,7 @@
; GCN-LABEL: flat_store_b16_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
-; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
@@ -311,7 +311,7 @@
; GCN-LABEL: flat_store_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
-; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
@@ -337,12 +337,15 @@
; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
-; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; SDAG-NEXT: s_mov_b32 s0, exec_lo
+; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; SDAG-NEXT: s_cbranch_execnz .LBB21_3
; SDAG-NEXT: ; %bb.1: ; %Flow
@@ -360,13 +363,16 @@
; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; SDAG-NEXT: s_cbranch_execz .LBB21_2
; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
+; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; SDAG-NEXT: s_wait_loadcnt 0x0
; SDAG-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
-; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; SDAG-NEXT: s_wait_xcnt 0x0
; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; SDAG-NEXT: s_branch .LBB21_5
@@ -374,19 +380,21 @@
;
; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
; GISEL-NEXT: v_mov_b32_e32 v2, v0
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base
-; GISEL-NEXT: s_mov_b32 s2, exec_lo
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0
; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_xor_b32_e32 v0, s2, v5
+; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5
+; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
; GISEL-NEXT: s_cbranch_execnz .LBB21_3
; GISEL-NEXT: ; %bb.1: ; %Flow
@@ -398,19 +406,22 @@
; GISEL-NEXT: s_branch .LBB21_5
; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1
-; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr4
; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GISEL-NEXT: s_wait_xcnt 0x0
; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
; GISEL-NEXT: s_cbranch_execz .LBB21_2
; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
+; GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
-; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
+; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GISEL-NEXT: s_wait_xcnt 0x0
; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GISEL-NEXT: s_branch .LBB21_5