Revert "[RegAlloc] Fix the terminal rule check for interfere with DstReg (#168661)" This reverts commit 0859ac5866a0228f5607dd329f83f4a9622dedcc. The reverted commit caused a couple of test failures, likely due to a mid-air collision with other changes that landed at the same time. Reverting for now to get the tree back to green and to allow the original author to re-run the update_*_test_checks.py scripts (UTC and friends) and verify the regenerated output.
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index e624088..25c4375 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -4150,7 +4150,7 @@ continue; Register OtherSrcReg, OtherReg; unsigned OtherSrcSubReg = 0, OtherSubReg = 0; - if (!isMoveInstr(*TRI, &MI, OtherSrcReg, OtherReg, OtherSrcSubReg, + if (!isMoveInstr(*TRI, &Copy, OtherSrcReg, OtherReg, OtherSrcSubReg, OtherSubReg)) return false; if (OtherReg == SrcReg)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 99c5403..4894932 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -803,20 +803,20 @@ ; CHECK-SD-NEXT: smlal2 v4.2d, v16.4s, v20.4s ; CHECK-SD-NEXT: smlal v6.2d, v16.2s, v20.2s ; CHECK-SD-NEXT: smlal v3.2d, v16.2s, v19.2s -; CHECK-SD-NEXT: smlal2 v0.2d, v16.4s, v18.4s +; CHECK-SD-NEXT: smlal2 v1.2d, v16.4s, v18.4s ; CHECK-SD-NEXT: smlal v7.2d, v16.2s, v17.2s -; CHECK-SD-NEXT: smlal v1.2d, v16.2s, v18.2s +; CHECK-SD-NEXT: smlal v0.2d, v16.2s, v18.2s ; CHECK-SD-NEXT: smlal2 v5.2d, v16.4s, v17.4s ; CHECK-SD-NEXT: b.ne .LBB6_7 ; CHECK-SD-NEXT: // %bb.8: // %middle.block -; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v6.2d ; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d ; CHECK-SD-NEXT: cmp x10, x9 -; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d ; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d -; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d -; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-SD-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: b.eq .LBB6_15
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll index a4f2090..7542e9c 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -35,15 +35,15 @@ ; CHECK-LABEL: check_deinterleaving_has_deinterleave: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: add x8, x0, #16 -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: mov w9, #32 // =0x20 ; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: mov w9, #32 // =0x20 +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: movi v5.2d, #0000000000000000 -; CHECK-NEXT: movi v6.2d, #0000000000000000 ; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: movi v6.2d, #0000000000000000 ; CHECK-NEXT: movi v16.2d, #0000000000000000 ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -64,31 +64,31 @@ ; CHECK-NEXT: ushll v24.4s, v18.4h, #0 ; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0 ; CHECK-NEXT: ushll v20.4s, v20.4h, #0 -; CHECK-NEXT: and v21.16b, v21.16b, v2.16b -; CHECK-NEXT: and v19.16b, v19.16b, v2.16b -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b -; CHECK-NEXT: and v17.16b, v17.16b, v2.16b -; CHECK-NEXT: and v23.16b, v23.16b, v2.16b -; CHECK-NEXT: and v24.16b, v24.16b, v2.16b -; CHECK-NEXT: and v18.16b, v18.16b, v2.16b -; CHECK-NEXT: and v20.16b, v20.16b, v2.16b -; CHECK-NEXT: add v5.4s, v5.4s, v19.4s -; CHECK-NEXT: add v3.4s, v3.4s, v21.4s -; CHECK-NEXT: add v1.4s, v1.4s, v22.4s -; CHECK-NEXT: add v4.4s, v4.4s, v17.4s +; CHECK-NEXT: and v21.16b, v21.16b, v1.16b +; CHECK-NEXT: and v19.16b, v19.16b, v1.16b +; CHECK-NEXT: and v22.16b, v22.16b, v1.16b +; CHECK-NEXT: and v17.16b, v17.16b, v1.16b +; CHECK-NEXT: and v23.16b, v23.16b, v1.16b +; CHECK-NEXT: and v24.16b, v24.16b, v1.16b +; CHECK-NEXT: and v18.16b, v18.16b, v1.16b +; CHECK-NEXT: and v20.16b, v20.16b, v1.16b +; CHECK-NEXT: add v4.4s, v4.4s, v19.4s +; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v0.4s, v0.4s, v22.4s +; CHECK-NEXT: add v3.4s, 
v3.4s, v17.4s ; CHECK-NEXT: add v16.4s, v16.4s, v23.4s -; CHECK-NEXT: add v6.4s, v6.4s, v24.4s -; CHECK-NEXT: add v7.4s, v7.4s, v20.4s -; CHECK-NEXT: add v0.4s, v0.4s, v18.4s +; CHECK-NEXT: add v5.4s, v5.4s, v24.4s +; CHECK-NEXT: add v6.4s, v6.4s, v20.4s +; CHECK-NEXT: add v7.4s, v7.4s, v18.4s ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v2.4s, v16.4s, v5.4s -; CHECK-NEXT: add v1.4s, v6.4s, v1.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v1.4s, v7.4s, v3.4s +; CHECK-NEXT: add v3.4s, v16.4s, v4.4s +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s ; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index ddeeca7..4f00aed 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -31,14 +31,14 @@ ; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 -; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d -; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -205,20 +205,20 @@ ; CHECK-NEXT: ldr z18, [x1, #3, mul vl] ; CHECK-NEXT: ldr z19, [x1, #2, mul vl] ; CHECK-NEXT: add x1, x1, x10 -; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0 ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90 +; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90 ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90 ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z5.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d ; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z0.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d ; CHECK-NEXT: fadd z1.d, z4.d, z5.d ; CHECK-NEXT: fadd z2.d, z2.d, z0.d ; CHECK-NEXT: faddv d0, p0, z1.d
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll index 355adec..aed3072 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -25,14 +25,14 @@ ; CHECK-NEXT: ldp q3, q2, [x9] ; CHECK-NEXT: cmp x8, #1600 ; CHECK-NEXT: ldp q5, q4, [x10] -; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0 -; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0 -; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90 -; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90 +; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d +; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: faddp d1, v2.2d ; CHECK-NEXT: ret @@ -159,20 +159,20 @@ ; CHECK-NEXT: ldp q17, q16, [x8], #64 ; CHECK-NEXT: ldp q19, q18, [x9], #64 ; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0 -; CHECK-NEXT: fcmla v1.2d, v6.2d, v4.2d, #0 -; CHECK-NEXT: fcmla v0.2d, v19.2d, v17.2d, #0 +; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0 +; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0 ; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90 -; CHECK-NEXT: fcmla v1.2d, v6.2d, v4.2d, #90 -; CHECK-NEXT: fcmla v0.2d, v19.2d, v17.2d, #90 +; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90 +; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: zip2 v4.2d, v0.2d, v3.2d -; CHECK-NEXT: zip1 v0.2d, v0.2d, v3.2d -; CHECK-NEXT: zip2 v3.2d, v2.2d, v1.2d -; CHECK-NEXT: zip1 v1.2d, v2.2d, v1.2d -; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d +; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d +; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d +; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d +; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d ; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: faddp d1, v1.2d
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index 0fe4683..3380842 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
@@ -16,9 +16,8 @@ ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: mov w9, w11 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 6c6a691..52a77cb 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -147,15 +147,15 @@ define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) { ; CHECK-LABEL: extract_v4i1_nxv32i1_0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.b[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: umov w8, v1.b[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0) ret <4 x i1> %ext
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index 1cefe96..7299410 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -248,15 +248,15 @@ define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) { ; CHECK-LABEL: extract_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1> %inmask, i64 0) ret <4 x i1> %mask @@ -265,23 +265,23 @@ define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) { ; CHECK-LABEL: extract_v8i1_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.h[4] ; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[4] +; CHECK-NEXT: mov v0.b[4], w8 ; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.h[6] ; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[6] +; CHECK-NEXT: mov v0.b[6], w8 ; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[6], w9 ; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> %inmask, i64 0) ret <8 x i1> %mask
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll index 41e4a38..8e807cd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -8,15 +8,15 @@ define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 { ; CHECK-LABEL: reshuffle_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %el0 = extractelement <vscale x 4 x i1> %a, i32 0 %el1 = extractelement <vscale x 4 x i1> %a, i32 1
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 74a717f..935189d 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2835,11 +2835,11 @@ ; CHECK-BE-NEXT: .LBB24_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -2847,11 +2847,11 @@ ; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -2950,26 +2950,26 @@ ; CHECK-BE-NEXT: .LBB25_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x0] -; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: add x8, x1, #32 -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] +; CHECK-BE-NEXT: add x10, x1, #48 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x1] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: ld1 { v20.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v20.4s }, [x9] ; CHECK-BE-NEXT: ld1 { v22.4s }, [x1] -; CHECK-BE-NEXT: add x8, x0, #96 +; CHECK-BE-NEXT: add x9, x0, #96 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: 
ext v24.16b, v18.16b, v18.16b, #8 -; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 -; CHECK-BE-NEXT: add x10, x0, #16 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 @@ -2986,22 +2986,22 @@ ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: st1 { v5.2d }, [x9] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s -; CHECK-BE-NEXT: add x8, x0, #112 +; CHECK-BE-NEXT: add x9, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s -; CHECK-BE-NEXT: st1 { v18.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #80 +; CHECK-BE-NEXT: st1 { v18.2d }, [x10] +; CHECK-BE-NEXT: add x10, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] -; CHECK-BE-NEXT: st1 { v17.2d }, [x8] -; CHECK-BE-NEXT: add x8, x0, #64 -; CHECK-BE-NEXT: st1 { v19.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: add x0, x0, #64 +; CHECK-BE-NEXT: st1 { v17.2d }, [x9] +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: st1 { v19.2d }, [x10] +; CHECK-BE-NEXT: st1 { v5.2d }, [x0] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: st1 { v4.2d }, [x8] ; CHECK-BE-NEXT: b.ne .LBB25_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3093,13 +3093,14 @@ ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, 
x0, #32 +; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x0] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #16 -; CHECK-BE-NEXT: ld1 { v17.4s }, [x8] -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] -; CHECK-BE-NEXT: ld1 { v19.4s }, [x10] +; CHECK-BE-NEXT: add x10, x0, #48 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: ld1 { v17.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] +; CHECK-BE-NEXT: ld1 { v19.4s }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b @@ -3113,11 +3114,10 @@ ; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s ; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s ; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x10 -; CHECK-BE-NEXT: st1 { v6.4s }, [x8] -; CHECK-BE-NEXT: st1 { v7.4s }, [x9] -; CHECK-BE-NEXT: st1 { v4.4s }, [x10] +; CHECK-BE-NEXT: st1 { v5.4s }, [x8] +; CHECK-BE-NEXT: st1 { v6.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.4s }, [x10] +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB26_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3246,11 +3246,11 @@ ; CHECK-BE-NEXT: .LBB28_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -3258,11 +3258,11 @@ ; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, 
x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB28_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 8372d22..c1e6b4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -21,14 +21,14 @@ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX10-NEXT: s_mov_b32 s8, exec_lo +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_add_i32 s6, s6, 1 -; GFX10-NEXT: s_xor_b32 s8, s5, s8 +; GFX10-NEXT: s_xor_b32 s5, s5, s8 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, s5 -; GFX10-NEXT: s_mov_b32 s5, s8 -; GFX10-NEXT: s_or_b32 s7, s7, s9 +; GFX10-NEXT: s_and_b32 s8, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 7bd1ff2..9a90faf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -78,12 +78,13 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v6, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v5, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v3, v6, v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -126,13 +127,14 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v6, v1, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] ; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -222,13 +224,14 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v6, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] ; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) 
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -520,28 +523,28 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] +; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4] ; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 +; 
GFX11-NEXT: v_mul_lo_u32 v1, v3, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index ba7fb1b..b12fa0a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -3142,8 +3142,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6] ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; @@ -3154,8 +3154,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -3166,8 +3166,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3187,8 +3187,8 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4] -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v3 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6] ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 50e28a7..88e3c86 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2147,12 +2147,12 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2190,12 +2190,12 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index a9938f1..0a098eb 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1889,13 +1889,13 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1926,13 +1926,13 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index d499b3d..b6eaaf1 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -326,12 +326,12 @@ ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -347,12 +347,12 @@ ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -440,12 +440,12 @@ ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -462,12 +462,12 @@ ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -880,13 +880,14 @@ ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -913,13 +914,14 @@ ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, 
v[3:4], v[1:2] +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -936,14 +938,14 @@ ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -968,13 +970,13 @@ ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -990,13 +992,13 @@ ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1012,13 +1014,13 @@ ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1034,13 +1036,13 @@ ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; 
GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1063,13 +1065,14 @@ ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1096,13 +1099,14 @@ ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; 
GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1119,14 +1123,14 @@ ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1151,13 +1155,13 @@ ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: 
; %bb.2: ; %atomicrmw.end @@ -1173,13 +1177,13 @@ ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1195,13 +1199,13 @@ ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1218,13 +1222,13 @@ ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: 
v_add_f64 v[0:1], v[3:4], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1329,30 +1333,30 @@ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v3, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX942-NEXT: ds_read_b32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX942-NEXT: v_not_b32_e32 v2, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 24, v3 +; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16: @@ -1463,30 +1467,30 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 -; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3 +; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
local_atomic_fadd_ret_f16: @@ -1717,30 +1721,30 @@ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v3, v1 +; GFX942-NEXT: ds_read_b32 v2, v1 ; GFX942-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX942-NEXT: v_not_b32_e32 v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0 +; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset: @@ -1857,30 +1861,30 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: ds_read_b32 v2, v1 ; 
GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_f16__offset: @@ -2032,27 +2036,27 @@ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: 
v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2073,28 +2077,28 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2119,15 +2123,15 @@ ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2140,27 +2144,27 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v3, v0, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2175,28 +2179,28 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2211,23 +2215,23 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -2249,15 +2253,15 @@ ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2278,15 +2282,15 @@ ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2308,16 +2312,16 @@ ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2338,18 +2342,18 @@ ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; 
GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2370,18 +2374,18 @@ ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2412,19 +2416,19 @@ ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2455,19 +2459,20 @@ ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2493,15 +2498,15 @@ ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2524,19 +2529,19 @@ ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2561,19 +2566,20 @@ ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2596,16 +2602,16 @@ ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2628,15 +2634,15 @@ ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_add_f16_e32 
v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2658,15 +2664,15 @@ ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2689,16 +2695,16 @@ ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; 
GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2720,18 +2726,18 @@ ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2753,18 +2759,18 @@ ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2846,19 +2852,19 @@ ; GFX942-LABEL: local_atomic_fadd_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2949,19 +2955,19 @@ ; 
GFX90A-LABEL: local_atomic_fadd_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3086,16 +3092,16 @@ ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3118,16 +3124,17 @@ ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3147,13 +3154,13 @@ ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v1 -; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3168,16 +3175,16 @@ ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3194,16 +3201,17 @@ ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3220,15 +3228,15 @@ ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f16_e32 v2, 4.0, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -3245,13 +3253,13 @@ ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX90A-NEXT: 
v_and_or_b32 v1, v2, s6, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3267,13 +3275,13 @@ ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v1 -; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3289,14 +3297,14 @@ ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_e32 v2, 4.0, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3312,16 +3320,16 @@ ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3338,16 +3346,16 @@ ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3483,27 +3491,27 @@ ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: 
v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16: @@ -3650,25 +3658,25 @@ ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
local_atomic_fadd_ret_bf16: @@ -3942,27 +3950,27 @@ ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset: @@ 
-4115,25 +4123,25 @@ ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -4297,38 +4305,38 @@ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4349,37 +4357,37 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; 
GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4405,22 +4413,22 @@ ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: 
v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4433,38 +4441,38 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 
v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4479,37 +4487,37 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4524,28 +4532,28 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4568,20 +4576,20 @@ ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 
0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4603,20 +4611,20 @@ ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; 
GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4638,22 +4646,22 @@ ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: 
v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4674,18 +4682,18 @@ ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4706,18 +4714,18 @@ ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4748,29 +4756,30 @@ ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4801,28 +4810,29 @@ ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; 
GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4849,22 +4859,22 @@ ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -4888,28 +4898,29 @@ ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4935,27 +4946,28 @@ ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4978,21 +4990,21 @@ ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; 
GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5016,20 +5028,20 @@ ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5052,20 +5064,20 @@ ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5088,22 +5100,22 @@ ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5125,18 +5137,18 @@ ; GFX7-NEXT: .LBB17_1: ; 
%atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5158,18 +5170,18 @@ ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5271,13 +5283,14 @@ ; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5292,7 +5305,6 @@ ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5409,13 +5421,14 @@ ; GFX90A-LABEL: local_atomic_fadd_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5429,7 +5442,6 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5569,26 +5581,27 @@ ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, 
v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5611,25 +5624,26 @@ ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo 
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5650,21 +5664,21 @@ ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5680,25 +5694,26 @@ ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 
offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5716,24 +5731,25 @@ ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5750,21 +5766,21 @@ ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: 
s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5782,20 +5798,20 @@ ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5812,20 +5828,20 @@ ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5841,21 +5857,21 @@ ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 
vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5871,16 +5887,16 @@ ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5897,16 +5913,16 @@ ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; 
GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5994,17 +6010,17 @@ ; GFX90A-LABEL: local_atomic_fadd_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6218,17 +6234,17 @@ ; GFX90A-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6399,13 +6415,14 @@ ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6422,13 +6439,13 @@ ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6444,12 +6461,12 @@ ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX90A-NEXT: 
ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6464,12 +6481,12 @@ ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6485,14 +6502,14 @@ ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6614,13 +6631,14 @@ ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6637,13 +6655,13 @@ ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6659,12 +6677,12 @@ ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6679,12 +6697,12 @@ ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6700,14 +6718,14 @@ ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6962,40 +6980,40 @@ ; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: 
v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_v2bf16: @@ -7316,40 +7334,40 @@ ; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: 
v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_v2bf16__offset: @@ -7547,30 +7565,32 @@ ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7591,30 +7611,32 @@ ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; 
GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7634,27 +7656,27 @@ ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz 
.LBB26_1 @@ -7674,26 +7696,26 @@ ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7712,26 +7734,26 @@ ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; 
GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7749,29 +7771,29 @@ ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; 
GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7888,30 +7910,32 @@ ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 
1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7932,30 +7956,32 @@ ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, 
v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7975,27 +8001,27 @@ ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8015,26 +8041,26 @@ ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8053,26 +8079,26 @@ ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8090,29 +8116,29 @@ ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: 
v_add_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8849,20 +8875,20 @@ ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s0, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v3, v1 +; GFX7-NEXT: ds_read_b32 v2, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: 
ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 ; GFX7-NEXT: .LBB28_7: ; %Flow21 @@ -8973,20 +8999,20 @@ ; GFX6-NEXT: ; %bb.5: ; GFX6-NEXT: s_lshl_b32 s0, s3, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v3, v1 +; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 ; GFX6-NEXT: .LBB28_7: ; %Flow19 @@ -9677,20 +9703,20 @@ ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s0, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v3, v1 +; GFX7-NEXT: ds_read_b32 v2, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; 
GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 ; GFX7-NEXT: .LBB29_7: ; %Flow21 @@ -9801,20 +9827,20 @@ ; GFX6-NEXT: ; %bb.5: ; GFX6-NEXT: s_lshl_b32 s0, s3, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v3, v1 +; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 ; GFX6-NEXT: .LBB29_7: ; %Flow19 @@ -10084,12 +10110,12 @@ ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 
v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10105,12 +10131,12 @@ ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 282c754..8e094a7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -886,21 +886,21 @@ ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16: @@ -1025,21 +1025,21 @@ ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f16: @@ -1285,21 +1285,21 @@ ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1430,21 +1430,21 @@ ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1598,29 +1598,29 @@ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: 
v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -1641,29 +1641,29 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -1688,16 +1688,16 @@ ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: 
v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1710,29 +1710,29 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1747,29 +1747,29 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 
v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1784,24 +1784,24 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: 
v_max_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -1823,16 +1823,16 @@ ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1853,16 +1853,16 @@ ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: 
v_lshlrev_b32_e32 v4, v0, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1884,17 +1884,17 @@ ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1915,18 +1915,18 @@ ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: 
v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1947,18 +1947,18 @@ ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1989,20 +1989,21 @@ ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2033,21 +2034,21 @@ ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 
0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2073,16 +2074,16 @@ ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 
v1, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2105,20 +2106,21 @@ ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2143,21 +2145,21 @@ ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2180,17 +2182,17 @@ ; GFX10-NEXT: .LBB11_1: 
; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2213,16 +2215,16 @@ ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2244,16 +2246,16 @@ ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2276,17 +2278,17 @@ ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: 
ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2308,18 +2310,18 @@ ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2341,18 +2343,18 @@ ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2437,12 +2439,13 @@ ; GFX942-LABEL: local_atomic_fmax_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v1 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 @@ -2450,7 +2453,6 @@ ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2545,12 +2547,13 @@ ; GFX90A-LABEL: local_atomic_fmax_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 @@ -2558,7 +2561,6 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2685,17 +2687,18 @@ ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2718,18 +2721,18 @@ ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2749,14 +2752,14 @@ ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2771,17 +2774,18 @@ ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2798,18 +2802,18 @@ ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2826,16 +2830,16 @@ ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 
@@ -2852,14 +2856,14 @@ ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX90A-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2875,14 +2879,14 @@ ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX908-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,15 +2902,15 @@ ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; 
GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2922,16 +2926,16 @@ ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,16 +2952,16 @@ ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3093,27 +3097,27 @@ ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; 
GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16: @@ -3260,25 +3264,25 @@ ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_bf16: @@ -3554,27 +3558,27 @@ ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: 
v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3727,25 +3731,25 @@ ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3911,38 +3915,38 @@ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3963,37 +3967,37 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 
0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4019,22 +4023,22 @@ ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 
v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4047,38 +4051,38 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4093,37 +4097,37 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4138,28 +4142,28 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4182,20 +4186,20 @@ ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4217,20 +4221,20 @@ ; GFX908-NEXT: 
.LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4252,22 +4256,22 @@ ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; 
GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4288,19 +4292,19 @@ ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4321,19 +4325,19 @@ ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4364,29 +4368,30 @@ ; GFX12-TRUE16-NEXT: .LBB17_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4417,28 +4422,29 @@ ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4465,22 +4471,22 @@ ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: 
s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4504,28 +4510,29 @@ ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 
4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4551,27 +4558,28 @@ ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4594,21 +4602,21 @@ ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4632,20 +4640,20 @@ ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: 
v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4668,20 +4676,20 @@ ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 
v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4704,22 +4712,22 @@ ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4741,19 +4749,19 @@ ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -4775,19 +4783,19 @@ ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4889,13 +4897,14 @@ ; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -4910,7 +4919,6 @@ ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5027,13 +5035,14 @@ ; GFX90A-LABEL: local_atomic_fmax_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5047,7 +5056,6 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5189,26 +5197,27 @@ ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: 
v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5231,25 +5240,26 @@ ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5270,21 +5280,21 @@ ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5300,25 +5310,26 @@ ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; 
GFX11-TRUE16-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5336,24 +5347,25 @@ ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5370,21 +5382,21 @@ ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; 
GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5402,20 +5414,20 @@ ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5432,20 +5444,20 @@ ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -5461,21 +5473,21 @@ ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5491,17 +5503,17 @@ ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; 
GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5518,17 +5530,17 @@ ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5580,25 +5592,25 @@ ; GFX942-LABEL: local_atomic_fmax_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; 
GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_v2f16: @@ -5656,24 +5668,24 @@ ; GFX90A-LABEL: local_atomic_fmax_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2f16: @@ -5852,25 +5864,25 @@ ; GFX942-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -5928,24 +5940,24 @@ ; GFX90A-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; 
GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -6101,15 +6113,15 @@ ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6129,14 +6141,14 @@ ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6152,15 +6164,15 @@ ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6178,14 +6190,14 @@ ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6202,13 +6214,13 @@ ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6224,13 +6236,13 @@ ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6248,16 +6260,16 @@ ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6363,15 +6375,15 @@ ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 
offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6391,14 +6403,14 @@ ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6414,15 +6426,15 @@ ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, 
v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6440,14 +6452,14 @@ ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6464,13 +6476,13 @@ ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6486,13 +6498,13 @@ ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: 
v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6510,16 +6522,16 @@ ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6726,41 +6738,41 @@ ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, 
v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16: @@ -6898,40 +6910,40 @@ ; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; 
GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2bf16: @@ -7204,41 +7216,41 @@ ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; 
GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -7376,40 +7388,40 @@ ; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 
16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -7589,31 +7601,34 @@ ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX12-TRUE16-NEXT: 
ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7638,32 +7653,33 @@ ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 
v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7686,27 +7702,27 @@ ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 
s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7724,30 +7740,32 @@ ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7768,30 +7786,32 @@ ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7811,27 +7831,27 @@ ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7851,26 +7871,26 @@ ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7889,26 +7909,26 @@ ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 
v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7926,29 +7946,29 @@ ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: 
v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8047,31 +8067,34 @@ ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 
0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8096,32 +8119,33 @@ ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8144,27 +8168,27 @@ ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8182,30 +8206,32 @@ ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8226,30 +8252,32 @@ ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8269,27 +8297,27 @@ ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; 
GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8309,26 +8337,26 @@ ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: 
ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8347,26 +8375,26 @@ ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8384,29 +8412,29 @@ ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 4a6428e..0aa8d33 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -886,21 +886,21 @@ ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16: @@ -1025,21 +1025,21 @@ ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f16: @@ -1285,21 +1285,21 @@ ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1430,21 +1430,21 @@ ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1598,29 +1598,29 @@ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: 
v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -1641,29 +1641,29 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -1688,16 +1688,16 @@ ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: 
v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1710,29 +1710,29 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; 
GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1747,29 +1747,29 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 
v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1784,24 +1784,24 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: 
v_min_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -1823,16 +1823,16 @@ ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1853,16 +1853,16 @@ ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: 
v_lshlrev_b32_e32 v4, v0, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1884,17 +1884,17 @@ ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1915,18 +1915,18 @@ ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: 
v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1947,18 +1947,18 @@ ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1989,20 +1989,21 @@ ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2033,21 +2034,21 @@ ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 
0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2073,16 +2074,16 @@ ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 
v1, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2105,20 +2106,21 @@ ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2143,21 +2145,21 @@ ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2180,17 +2182,17 @@ ; GFX10-NEXT: .LBB11_1: 
; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2213,16 +2215,16 @@ ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2244,16 +2246,16 @@ ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2276,17 +2278,17 @@ ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: 
ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2308,18 +2310,18 @@ ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2341,18 +2343,18 @@ ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2437,12 +2439,13 @@ ; GFX942-LABEL: local_atomic_fmin_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 @@ -2450,7 +2453,6 @@ ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2545,12 +2547,13 @@ ; GFX90A-LABEL: local_atomic_fmin_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 @@ -2558,7 +2561,6 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2685,17 +2687,18 @@ ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2718,18 +2721,18 @@ ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2749,14 +2752,14 @@ ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2771,17 +2774,18 @@ ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2798,18 +2802,18 @@ ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2826,16 +2830,16 @@ ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 
@@ -2852,14 +2856,14 @@ ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2875,14 +2879,14 @@ ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,15 +2902,15 @@ ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; 
GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2922,16 +2926,16 @@ ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,16 +2952,16 @@ ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3093,27 +3097,27 @@ ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; 
GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16: @@ -3260,25 +3264,25 @@ ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_bf16: @@ -3554,27 +3558,27 @@ ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: 
v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3727,25 +3731,25 @@ ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3911,38 +3915,38 @@ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3963,37 +3967,37 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 
0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4019,22 +4023,22 @@ ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 
v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4047,38 +4051,38 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4093,37 +4097,37 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4138,28 +4142,28 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4182,20 +4186,20 @@ ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4217,20 +4221,20 @@ ; GFX908-NEXT: 
.LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4252,22 +4256,22 @@ ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; 
GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4288,19 +4292,19 @@ ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4321,19 +4325,19 @@ ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4364,29 +4368,30 @@ ; GFX12-TRUE16-NEXT: .LBB17_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4417,28 +4422,29 @@ ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4465,22 +4471,22 @@ ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: 
s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4504,28 +4510,29 @@ ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 
4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4551,27 +4558,28 @@ ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4594,21 +4602,21 @@ ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4632,20 +4640,20 @@ ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: 
v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4668,20 +4676,20 @@ ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 
v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4704,22 +4712,22 @@ ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4741,19 +4749,19 @@ ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -4775,19 +4783,19 @@ ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4889,13 +4897,14 @@ ; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -4910,7 +4919,6 @@ ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5027,13 +5035,14 @@ ; GFX90A-LABEL: local_atomic_fmin_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5047,7 +5056,6 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5189,26 +5197,27 @@ ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: 
v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5231,25 +5240,26 @@ ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5270,21 +5280,21 @@ ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5300,25 +5310,26 @@ ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; 
GFX11-TRUE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5336,24 +5347,25 @@ ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5370,21 +5382,21 @@ ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; 
GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5402,20 +5414,20 @@ ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5432,20 +5444,20 @@ ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -5461,21 +5473,21 @@ ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5491,17 +5503,17 @@ ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; 
GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5518,17 +5530,17 @@ ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5580,25 +5592,25 @@ ; GFX942-LABEL: local_atomic_fmin_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; 
GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_v2f16: @@ -5656,24 +5668,24 @@ ; GFX90A-LABEL: local_atomic_fmin_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2f16: @@ -5852,25 +5864,25 @@ ; GFX942-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -5928,24 +5940,24 @@ ; GFX90A-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; 
GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -6101,15 +6113,15 @@ ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6129,14 +6141,14 @@ ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6152,15 +6164,15 @@ ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6178,14 +6190,14 @@ ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6202,13 +6214,13 @@ ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6224,13 +6236,13 @@ ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6248,16 +6260,16 @@ ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6363,15 +6375,15 @@ ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 
offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6391,14 +6403,14 @@ ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6414,15 +6426,15 @@ ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, 
v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6440,14 +6452,14 @@ ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6464,13 +6476,13 @@ ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6486,13 +6498,13 @@ ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: 
v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6510,16 +6522,16 @@ ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6726,41 +6738,41 @@ ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, 
v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16: @@ -6898,40 +6910,40 @@ ; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; 
GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2bf16: @@ -7204,41 +7216,41 @@ ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; 
GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -7376,40 +7388,40 @@ ; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 
16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -7589,31 +7601,34 @@ ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX12-TRUE16-NEXT: 
ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7638,32 +7653,33 @@ ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 
v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7686,27 +7702,27 @@ ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 
s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7724,30 +7740,32 @@ ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7768,30 +7786,32 @@ ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7811,27 +7831,27 @@ ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7851,26 +7871,26 @@ ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7889,26 +7909,26 @@ ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 
v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7926,29 +7946,29 @@ ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: 
v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8047,31 +8067,34 @@ ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 
0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8096,32 +8119,33 @@ ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8144,27 +8168,27 @@ ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8182,30 +8206,32 @@ ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8226,30 +8252,32 @@ ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8269,27 +8297,27 @@ ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; 
GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8309,26 +8337,26 @@ ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: 
ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8347,26 +8375,26 @@ ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8384,29 +8412,29 @@ ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 7f95e7d..929bb61 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -50,17 +50,17 @@ ; GFX942-LABEL: local_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v1, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -119,17 +119,17 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v1, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -262,17 +262,17 @@ ; GFX942-LABEL: local_atomic_fsub_ret_f32__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: 
v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -331,17 +331,17 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -453,13 +453,14 @@ ; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ 
-478,12 +479,12 @@ ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -498,13 +499,14 @@ ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -521,13 +523,13 @@ ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, 
v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 @@ -543,12 +545,12 @@ ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -563,12 +565,12 @@ ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,12 +586,12 @@ ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -605,12 +607,12 @@ ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -626,12 +628,12 @@ ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -654,13 +656,14 @@ ; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -679,12 +682,12 @@ ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -699,13 +702,14 @@ ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -722,13 +726,13 @@ ; 
GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -744,12 +748,12 @@ ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -764,12 +768,12 @@ ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -785,12 +789,12 @@ ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -806,12 +810,12 @@ ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -828,12 +832,12 @@ ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -883,18 +887,18 @@ ; GFX942-LABEL: local_atomic_fsub_ret_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b64 v[4:5], v0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ds_read_b64 v[0:1], v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX942-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -953,18 +957,18 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[4:5], v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ds_read_b64 v[0:1], v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1100,18 +1104,18 @@ ; GFX942-LABEL: local_atomic_fsub_ret_f64__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b64 v[4:5], v0 offset:65528 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX942-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1170,18 +1174,18 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[4:5], v0 offset:65528 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1296,13 +1300,14 @@ ; GFX12-NEXT: .LBB6_1: ; 
%atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1321,12 +1326,12 @@ ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 -; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 +; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1341,13 +1346,14 @@ ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1364,14 +1370,14 @@ ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -1387,12 +1393,12 @@ ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] 
op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1407,13 +1413,13 @@ ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1429,13 +1435,13 @@ ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1451,13 +1457,13 @@ ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, 
v[1:2], v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1473,13 +1479,13 @@ ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1502,13 +1508,14 @@ ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1527,12 +1534,12 @@ ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 -; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 +; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1547,13 +1554,14 @@ ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1570,14 +1578,14 @@ ; GFX10-NEXT: .LBB7_1: 
; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1593,12 +1601,12 @@ ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1613,13 +1621,13 @@ ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 
v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1635,13 +1643,13 @@ ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1657,13 +1665,13 @@ ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1680,13 +1688,13 @@ ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], -4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], -4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1791,30 +1799,30 @@ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v3, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX942-NEXT: ds_read_b32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX942-NEXT: v_not_b32_e32 v2, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 24, v3 +; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; 
GFX942-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16: @@ -1925,30 +1933,30 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 -; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3 +; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f16: @@ -2179,30 +2187,30 @@ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v3, v1 +; GFX942-NEXT: ds_read_b32 v2, v1 ; GFX942-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX942-NEXT: v_not_b32_e32 v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0 +; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2319,30 +2327,30 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: ds_read_b32 v2, v1 ; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2494,27 +2502,27 @@ ; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ 
-2535,28 +2543,28 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; 
GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2581,15 +2589,15 @@ ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2602,27 +2610,27 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2637,28 +2645,28 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2673,23 +2681,23 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, 
v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -2711,15 +2719,15 @@ ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2740,15 +2748,15 @@ ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 
v0, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2770,16 +2778,16 @@ ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2800,18 +2808,18 @@ ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_add_f32_e32 
v4, -4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2832,18 +2840,18 @@ ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -2874,19 +2882,19 @@ ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2917,19 +2925,20 @@ ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 -; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2955,15 +2964,15 @@ ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; 
GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2986,19 +2995,19 @@ ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3023,19 +3032,20 @@ ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3058,16 +3068,16 @@ ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3090,15 +3100,15 @@ ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3120,15 +3130,15 @@ ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3151,16 +3161,16 @@ ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3182,18 +3192,18 @@ ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; 
GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3215,18 +3225,18 @@ ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3308,19 +3318,19 @@ ; GFX942-LABEL: local_atomic_fsub_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3411,19 +3421,19 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3548,16 +3558,16 @@ ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3580,16 +3590,17 @@ ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3609,13 +3620,13 @@ ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v1 -; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3630,16 +3641,16 @@ ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3656,16 +3667,17 @@ ; GFX11-FAKE16-NEXT: .LBB13_1: ; 
%atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3682,15 +3694,15 @@ ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f16_e32 v2, -4.0, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -3707,13 +3719,13 @@ ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3729,13 +3741,13 @@ ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v1 -; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3751,14 +3763,14 @@ ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_e32 v2, -4.0, v1 -; GFX8-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3774,16 +3786,16 @@ ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3800,16 +3812,16 @@ ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX6-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3945,27 +3957,27 @@ ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16: @@ -4112,25 +4124,25 @@ ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; 
GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_bf16: @@ -4404,27 +4416,27 @@ ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 
; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4577,25 +4589,25 @@ ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, 
v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4759,38 +4771,38 @@ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: 
v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4811,37 +4823,37 @@ ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: 
; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; 
GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4867,22 +4879,22 @@ ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 
exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4895,38 +4907,38 @@ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4941,37 +4953,37 @@ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4986,28 +4998,28 @@ ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5030,20 +5042,20 @@ ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5065,20 +5077,20 @@ ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5100,22 +5112,22 @@ ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa 
v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5136,18 +5148,18 @@ ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; 
GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5168,18 +5180,18 @@ ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5210,29 +5222,30 @@ ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; 
GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5263,28 +5276,29 @@ ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5311,22 +5325,22 @@ ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; 
GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5350,28 +5364,29 @@ ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5397,27 +5412,28 @@ ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5440,21 +5456,21 @@ ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; 
GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5478,20 +5494,20 @@ ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5514,20 +5530,20 @@ ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 +; 
GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5550,22 +5566,22 @@ ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5587,18 +5603,18 @@ ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5620,18 +5636,18 @@ ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 
v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5733,13 +5749,14 @@ ; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5754,7 +5771,6 @@ ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5871,13 +5887,14 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5891,7 +5908,6 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6031,26 +6047,27 @@ ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, 
vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6073,25 +6090,26 @@ ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6112,21 +6130,21 @@ ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 
-; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6142,25 +6160,26 @@ ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6178,24 +6197,25 @@ ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6212,21 +6232,21 @@ ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 
vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6244,20 +6264,20 @@ ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 
v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6274,20 +6294,20 @@ ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6303,21 +6323,21 @@ ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: 
v_add_f32_e32 v1, -4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6333,16 +6353,16 @@ ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6359,16 +6379,16 @@ ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6418,17 +6438,17 @@ ; GFX942-LABEL: local_atomic_fsub_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: 
s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6487,17 +6507,17 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6673,17 +6693,17 @@ ; GFX942-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6742,17 +6762,17 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6906,13 +6926,14 @@ ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6931,12 +6952,12 @@ ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6951,13 +6972,14 @@ ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6974,13 +6996,13 @@ ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6996,12 +7018,12 @@ ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] 
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7016,12 +7038,12 @@ ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7037,14 +7059,14 @@ ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7149,13 +7171,14 @@ ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7174,12 +7197,12 @@ ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7194,13 +7217,14 @@ ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7217,13 +7241,13 @@ ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -7239,12 +7263,12 @@ ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7259,12 +7283,12 @@ ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7280,14 +7304,14 @@ ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7494,41 +7518,41 @@ ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, 
v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16: @@ -7666,40 +7690,40 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, 
v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_v2bf16: @@ -7972,41 +7996,41 @@ ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; 
GFX942-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -8144,40 +8168,40 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -8357,31 
+8381,34 @@ ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8406,32 +8433,33 @@ ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8454,27 +8482,27 @@ ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, 
v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8492,30 +8520,32 @@ ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8536,30 +8566,32 @@ ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; 
GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8579,27 +8611,27 @@ ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -8619,26 +8651,26 @@ ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; 
GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8657,26 +8689,26 @@ ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; 
GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8694,29 +8726,29 @@ ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, 
v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8815,31 +8847,34 @@ ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8864,32 +8899,33 @@ ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: 
v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: 
v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8912,27 +8948,27 @@ ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8950,30 +8986,32 @@ ; GFX11-TRUE16-NEXT: .LBB27_1: ; 
%atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8994,30 +9032,32 @@ ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -9037,27 +9077,27 @@ ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: 
v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -9077,26 +9117,26 @@ ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9115,26 +9155,26 @@ ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, 
v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9152,29 +9192,29 @@ ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 
-; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9299,17 +9339,17 @@ ; GFX942-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v1, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9368,17 +9408,17 @@ ; GFX90A-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v1, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9489,13 +9529,14 @@ ; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; 
GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -9514,12 +9555,12 @@ ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB29_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9534,13 +9575,14 @@ ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9557,13 +9599,13 @@ ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1 @@ -9579,12 +9621,12 @@ ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9599,12 +9641,12 @@ ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 
v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9620,12 +9662,12 @@ ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9641,12 +9683,12 @@ ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9662,12 +9704,12 @@ ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 
v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 5c90957..bcece19 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -16,11 +16,11 @@ ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then ; GCN-NEXT: s_or_saveexec_b32 s1, -1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1 +; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: v_mov_b32_e32 v4, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen
diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll index 895b68b..ce40085 100644 --- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
@@ -25,8 +25,7 @@ %11 = sub nsw i32 %7, %9 %12 = icmp slt i32 %10, %11 br i1 %12, label %5, label %13 -; CHECK: r1 = r3 -; CHECK: if r2 s> r3 goto -10 <test+0x40> +; CHECK: if r2 s> r1 goto -10 <test+0x40> ; <label>:13: ; preds = %5, %2 %14 = phi i32 [ 0, %2 ], [ %9, %5 ]
diff --git a/llvm/test/CodeGen/Hexagon/swp-stages5.ll b/llvm/test/CodeGen/Hexagon/swp-stages5.ll index f3bc889..d6c4782 100644 --- a/llvm/test/CodeGen/Hexagon/swp-stages5.ll +++ b/llvm/test/CodeGen/Hexagon/swp-stages5.ll
@@ -8,6 +8,7 @@ ; CHECK-DAG: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: ; CHECK: = and([[REG0]],#255) +; CHECK: [[REG0]]{{[:0-9]*}} = ; CHECK: endloop define void @fred(ptr noalias nocapture %src, i32 %srcWidth, i32 %srcHeight, i32 %srcStride, ptr noalias nocapture %dst, i32 %dstStride) #0 {
diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll index b2a3f94..3057e91 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-b128.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll
@@ -756,24 +756,24 @@ ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB34_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: and.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: and.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: and.b64 %rd7, %rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB34_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw and ptr %ptr, i128 %val monotonic ret i128 %ret @@ -791,24 +791,24 @@ ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB35_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: or.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: or.b64 %rd7, %rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 
dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB35_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw or ptr %ptr, i128 %val monotonic ret i128 %ret @@ -826,24 +826,24 @@ ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB36_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: xor.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: xor.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: xor.b64 %rd7, %rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB36_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], 
{%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw xor ptr %ptr, i128 %val monotonic ret i128 %ret @@ -861,29 +861,29 @@ ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB37_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.lt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.lt.s64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB37_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw min ptr %ptr, i128 %val monotonic ret i128 %ret @@ -901,29 +901,29 @@ ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB38_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 
-; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.gt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.gt.s64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB38_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw max ptr %ptr, i128 %val monotonic ret i128 %ret @@ -941,29 +941,29 @@ ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB39_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: 
and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.lt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.lt.u64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB39_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw umin ptr %ptr, i128 %val monotonic ret i128 %ret @@ -981,29 +981,29 @@ ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB40_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.gt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.gt.u64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: 
selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB40_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw umax ptr %ptr, i128 %val monotonic ret i128 %ret
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index e2762ba..313be95 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -63,32 +63,32 @@ ; CHECKPTX62-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX62-NEXT: mov.b32 %r4, %r46; +; CHECKPTX62-NEXT: shr.u32 %r20, %r4, %r2; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3; ; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2; -; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX62-NEXT: and.b32 %r23, %r4, %r3; ; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; -; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46; -; CHECKPTX62-NEXT: mov.b32 %r46, %r4; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24; +; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r46, %r4; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44 ; CHECKPTX62-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX62-NEXT: mov.b32 %r5, %r47; +; CHECKPTX62-NEXT: shr.u32 %r25, %r5, %r2; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25; ; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00; ; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5; ; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6; ; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX62-NEXT: and.b32 %r28, %r5, %r3; ; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; -; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47; -; CHECKPTX62-NEXT: mov.b32 %r47, %r5; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29; +; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r47, %r5; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26 ; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4; @@ 
-100,16 +100,16 @@ ; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX62-NEXT: mov.b32 %r9, %r48; +; CHECKPTX62-NEXT: shr.u32 %r33, %r9, %r7; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33; ; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8; ; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7; -; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX62-NEXT: and.b32 %r36, %r9, %r8; ; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; -; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48; -; CHECKPTX62-NEXT: mov.b32 %r48, %r9; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37; +; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r48, %r9; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8 ; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4; @@ -121,16 +121,16 @@ ; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX62-NEXT: mov.b32 %r13, %r49; +; CHECKPTX62-NEXT: shr.u32 %r41, %r13, %r11; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41; ; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10; ; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX62-NEXT: and.b32 %r44, %r13, %r12; ; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; -; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49; -; CHECKPTX62-NEXT: mov.b32 %r49, %r13; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45; +; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r49, %r13; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; ; 
CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX62-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index e6c6a73..f5eefaa 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -63,33 +63,33 @@ ; CHECKPTX71-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX71-NEXT: mov.b32 %r4, %r46; +; CHECKPTX71-NEXT: shr.u32 %r20, %r4, %r2; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4; ; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2; -; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX71-NEXT: and.b32 %r23, %r4, %r3; ; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; -; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46; -; CHECKPTX71-NEXT: mov.b32 %r46, %r4; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24; +; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r46, %r4; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44 ; CHECKPTX71-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX71-NEXT: mov.b32 %r5, %r47; +; CHECKPTX71-NEXT: shr.u32 %r25, %r5, %r2; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25; ; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6; ; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7; ; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX71-NEXT: and.b32 %r28, %r5, %r3; ; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; -; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47; -; CHECKPTX71-NEXT: mov.b32 %r47, %r5; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29; +; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r47, %r5; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX71-NEXT: // %bb.4: // 
%atomicrmw.end26 ; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4; @@ -101,17 +101,17 @@ ; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX71-NEXT: mov.b32 %r9, %r48; +; CHECKPTX71-NEXT: shr.u32 %r33, %r9, %r7; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33; ; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10; ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7; -; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX71-NEXT: and.b32 %r36, %r9, %r8; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; -; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48; -; CHECKPTX71-NEXT: mov.b32 %r48, %r9; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37; +; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r48, %r9; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8 ; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4; @@ -123,17 +123,17 @@ ; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX71-NEXT: mov.b32 %r13, %r49; +; CHECKPTX71-NEXT: shr.u32 %r41, %r13, %r11; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41; ; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13; ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX71-NEXT: and.b32 %r44, %r13, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; -; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49; -; CHECKPTX71-NEXT: mov.b32 %r49, %r13; +; 
CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45; +; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r49, %r13; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index 6ea02f3..a4b49f7 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -442,22 +442,22 @@ ; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u32 %r8, %r17, %r1; +; CHECK-NEXT: mov.b32 %r3, %r17; +; CHECK-NEXT: shr.u32 %r8, %r3, %r1; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r8; ; CHECK-NEXT: cvt.f32.f16 %r9, %rs2; ; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r11; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs3; ; CHECK-NEXT: shl.b32 %r13, %r12, %r1; -; CHECK-NEXT: and.b32 %r14, %r17, %r2; +; CHECK-NEXT: and.b32 %r14, %r3, %r2; ; CHECK-NEXT: or.b32 %r15, %r14, %r13; ; CHECK-NEXT: membar.sys; -; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r17, %r15; -; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r17; -; CHECK-NEXT: mov.b32 %r17, %r3; +; CHECK-NEXT: atom.cas.b32 %r17, [%rd1], %r3, %r15; +; CHECK-NEXT: setp.ne.b32 %p1, %r17, %r3; ; CHECK-NEXT: @%p1 bra $L__BB24_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: shr.u32 %r16, %r3, %r1; +; CHECK-NEXT: shr.u32 %r16, %r17, %r1; ; CHECK-NEXT: st.param.b16 [func_retval0], %r16; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, half %val seq_cst
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll index d6dd959..fdb0131 100644 --- a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll +++ b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
@@ -49,15 +49,15 @@ ; PWR8-NEXT: # ; PWR8-NEXT: lxvd2x 0, 30, 28 ; PWR8-NEXT: vmr 2, 31 -; PWR8-NEXT: addi 26, 30, 16 +; PWR8-NEXT: mr 26, 30 +; PWR8-NEXT: addi 30, 30, 16 ; PWR8-NEXT: xxswapd 35, 0 ; PWR8-NEXT: bl __mulkf3 ; PWR8-NEXT: nop ; PWR8-NEXT: addi 29, 29, -1 ; PWR8-NEXT: xxswapd 0, 34 ; PWR8-NEXT: cmpldi 29, 0 -; PWR8-NEXT: stxvd2x 0, 30, 27 -; PWR8-NEXT: mr 30, 26 +; PWR8-NEXT: stxvd2x 0, 26, 27 ; PWR8-NEXT: bc 12, 1, .LBB0_1 ; PWR8-NEXT: # %bb.2: # %for.end ; PWR8-NEXT: li 3, 48
diff --git a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll index 826e306..786988f 100644 --- a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll +++ b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll
@@ -23,11 +23,11 @@ ; AIX64-NEXT: # %bb.2: # %for.body.preheader.new ; AIX64-NEXT: rlwinm 6, 5, 0, 1, 30 ; AIX64-NEXT: xxspltib 0, 6 -; AIX64-NEXT: addi 9, 4, -8 +; AIX64-NEXT: addi 11, 4, -8 ; AIX64-NEXT: addi 7, 3, -8 ; AIX64-NEXT: li 8, 8 -; AIX64-NEXT: li 10, 12 -; AIX64-NEXT: li 11, 4 +; AIX64-NEXT: li 9, 12 +; AIX64-NEXT: li 10, 4 ; AIX64-NEXT: addi 6, 6, -2 ; AIX64-NEXT: rldicl 6, 6, 63, 1 ; AIX64-NEXT: addi 6, 6, 1 @@ -36,16 +36,16 @@ ; AIX64-NEXT: .align 4 ; AIX64-NEXT: L..BB0_3: # %for.body ; AIX64-NEXT: # -; AIX64-NEXT: lxvwsx 1, 9, 8 +; AIX64-NEXT: lxvwsx 1, 11, 8 ; AIX64-NEXT: addi 6, 6, 2 ; AIX64-NEXT: xxland 1, 1, 0 ; AIX64-NEXT: xscvspdpn 1, 1 ; AIX64-NEXT: stfsu 1, 8(7) -; AIX64-NEXT: lxvwsx 1, 9, 10 -; AIX64-NEXT: addi 9, 9, 8 +; AIX64-NEXT: lxvwsx 1, 11, 9 +; AIX64-NEXT: addi 11, 11, 8 ; AIX64-NEXT: xxland 1, 1, 0 ; AIX64-NEXT: xxsldwi 1, 1, 1, 3 -; AIX64-NEXT: stfiwx 1, 7, 11 +; AIX64-NEXT: stfiwx 1, 7, 10 ; AIX64-NEXT: bdnz L..BB0_3 ; AIX64-NEXT: L..BB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; AIX64-NEXT: andi. 5, 5, 1 @@ -79,16 +79,16 @@ ; AIX32-NEXT: L..BB0_3: # %for.body ; AIX32-NEXT: # ; AIX32-NEXT: lxvwsx 1, 12, 9 +; AIX32-NEXT: lxvwsx 2, 12, 10 ; AIX32-NEXT: addic 6, 6, 2 +; AIX32-NEXT: addi 12, 12, 8 ; AIX32-NEXT: addze 11, 11 ; AIX32-NEXT: xor 0, 6, 7 ; AIX32-NEXT: or. 
0, 0, 11 ; AIX32-NEXT: xxland 1, 1, 0 ; AIX32-NEXT: xscvspdpn 1, 1 ; AIX32-NEXT: stfsu 1, 8(8) -; AIX32-NEXT: lxvwsx 1, 12, 10 -; AIX32-NEXT: addi 12, 12, 8 -; AIX32-NEXT: xxland 1, 1, 0 +; AIX32-NEXT: xxland 1, 2, 0 ; AIX32-NEXT: xscvspdpn 1, 1 ; AIX32-NEXT: stfs 1, 4(8) ; AIX32-NEXT: bne 0, L..BB0_3 @@ -116,11 +116,11 @@ ; LINUX64LE-NEXT: # %bb.2: # %for.body.preheader.new ; LINUX64LE-NEXT: rlwinm 6, 5, 0, 1, 30 ; LINUX64LE-NEXT: xxspltib 0, 6 -; LINUX64LE-NEXT: addi 8, 4, -8 +; LINUX64LE-NEXT: addi 11, 4, -8 ; LINUX64LE-NEXT: addi 7, 3, -8 -; LINUX64LE-NEXT: li 9, 8 -; LINUX64LE-NEXT: li 10, 12 -; LINUX64LE-NEXT: li 11, 4 +; LINUX64LE-NEXT: li 8, 8 +; LINUX64LE-NEXT: li 9, 12 +; LINUX64LE-NEXT: li 10, 4 ; LINUX64LE-NEXT: addi 6, 6, -2 ; LINUX64LE-NEXT: rldicl 6, 6, 63, 1 ; LINUX64LE-NEXT: addi 6, 6, 1 @@ -129,16 +129,16 @@ ; LINUX64LE-NEXT: .p2align 4 ; LINUX64LE-NEXT: .LBB0_3: # %for.body ; LINUX64LE-NEXT: # -; LINUX64LE-NEXT: lxvwsx 1, 8, 9 +; LINUX64LE-NEXT: lxvwsx 1, 11, 8 ; LINUX64LE-NEXT: addi 6, 6, 2 ; LINUX64LE-NEXT: xxland 1, 1, 0 ; LINUX64LE-NEXT: xxsldwi 1, 1, 1, 3 ; LINUX64LE-NEXT: xscvspdpn 1, 1 ; LINUX64LE-NEXT: stfsu 1, 8(7) -; LINUX64LE-NEXT: lxvwsx 1, 8, 10 -; LINUX64LE-NEXT: addi 8, 8, 8 +; LINUX64LE-NEXT: lxvwsx 1, 11, 9 +; LINUX64LE-NEXT: addi 11, 11, 8 ; LINUX64LE-NEXT: xxland 1, 1, 0 -; LINUX64LE-NEXT: stxvrwx 1, 7, 11 +; LINUX64LE-NEXT: stxvrwx 1, 7, 10 ; LINUX64LE-NEXT: bdnz .LBB0_3 ; LINUX64LE-NEXT: .LBB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; LINUX64LE-NEXT: andi. 5, 5, 1
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll index cc38e25..4e0394e 100644 --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -189,8 +189,8 @@ ; CHECK-NEXT: cmplwi r4, 0 ; CHECK-NEXT: beq cr0, .LBB2_4 ; CHECK-NEXT: # %bb.1: # %bb3.preheader -; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: addi r10, r3, 4002 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: li r5, -1 @@ -198,7 +198,6 @@ ; CHECK-NEXT: li r7, 3 ; CHECK-NEXT: li r8, 5 ; CHECK-NEXT: li r9, 9 -; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill @@ -215,7 +214,7 @@ ; CHECK-NEXT: ldx r28, r10, r8 ; CHECK-NEXT: ld r27, 12(r10) ; CHECK-NEXT: ld r26, 8(r10) -; CHECK-NEXT: ldx r25, r10, r9 +; CHECK-NEXT: ldx r12, r10, r9 ; CHECK-NEXT: addi r10, r10, 1 ; CHECK-NEXT: mulld r11, r11, r0 ; CHECK-NEXT: mulld r11, r11, r30 @@ -223,7 +222,7 @@ ; CHECK-NEXT: mulld r11, r11, r28 ; CHECK-NEXT: mulld r11, r11, r27 ; CHECK-NEXT: mulld r11, r11, r26 -; CHECK-NEXT: maddld r3, r11, r25, r3 +; CHECK-NEXT: maddld r3, r11, r12, r3 ; CHECK-NEXT: bdnz .LBB2_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -232,7 +231,6 @@ ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload ; CHECK-NEXT: add r3, r3, r4 ; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: addi r3, r4, 0
diff --git a/llvm/test/CodeGen/PowerPC/sink-side-effect.ll b/llvm/test/CodeGen/PowerPC/sink-side-effect.ll index 94d2a09..040c20b 100644 --- a/llvm/test/CodeGen/PowerPC/sink-side-effect.ll +++ b/llvm/test/CodeGen/PowerPC/sink-side-effect.ll
@@ -23,7 +23,7 @@ ; CHECK-NEXT: cmpw 4, 3 ; CHECK-NEXT: bge 0, .LBB0_3 ; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: xsmuldp 0, 0, 1 +; CHECK-NEXT: xsmuldp 1, 1, 0 ; CHECK-NEXT: b .LBB0_3 bb: %tmp = load i32, ptr %arg, align 8
diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll index 516d54b..5094570 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll
@@ -26,11 +26,12 @@ ; CHECK-NEXT: mullw 4, 6, 6 ; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: bdz .LBB0_3 -; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: mr 6, 5 ; CHECK-NEXT: stwu 4, 4(3) -; CHECK-NEXT: mullw 4, 5, 5 ; CHECK-NEXT: addi 5, 5, 1 +; CHECK-NEXT: mullw 4, 6, 6 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: stwu 4, 4(3)
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll index 871aab3..9cb2d44 100644 --- a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll +++ b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
@@ -145,14 +145,14 @@ ; CHECK32-NEXT: .align 4 ; CHECK32-NEXT: [[L2_foo:.*]]: # %for.body ; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK32-NEXT: slwi r8, r6, 4 -; CHECK32-NEXT: addic r6, r6, 1 -; CHECK32-NEXT: addze r7, r7 +; CHECK32-NEXT: slwi r8, r7, 4 +; CHECK32-NEXT: addic r7, r7, 1 +; CHECK32-NEXT: addze r6, r6 ; CHECK32-NEXT: lxvx vs2, r4, r8 ; CHECK32-NEXT: xvmaddmsp vs2, vs0, vs1 ; CHECK32-NEXT: stxvx vs2, r3, r8 -; CHECK32-NEXT: xor r8, r6, r5 -; CHECK32-NEXT: or. r8, r8, r7 +; CHECK32-NEXT: xor r8, r7, r5 +; CHECK32-NEXT: or. r8, r8, r6 ; CHECK32-NEXT: bne cr0, [[L2_foo]] ; CHECK32: .foo:
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll index 02aeebd..2aec92e 100644 --- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll +++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -127,13 +127,11 @@ ; RV32-NEXT: .LBB3_2: # %while.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lw a3, 0(a1) -; RV32-NEXT: addi a4, a1, 4 +; RV32-NEXT: addi a1, a1, 4 ; RV32-NEXT: slli a3, a3, 1 -; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: mv a1, a4 -; RV32-NEXT: bne a4, a2, .LBB3_2 +; RV32-NEXT: addi a0, a0, 4 +; RV32-NEXT: bne a1, a2, .LBB3_2 ; RV32-NEXT: .LBB3_3: # %while.end ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -151,13 +149,11 @@ ; RV64-NEXT: .LBB3_2: # %while.body ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64-NEXT: lw a3, 0(a1) -; RV64-NEXT: addi a4, a1, 4 +; RV64-NEXT: addi a1, a1, 4 ; RV64-NEXT: slli a3, a3, 1 -; RV64-NEXT: addi a1, a0, 4 ; RV64-NEXT: sw a3, 0(a0) -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: mv a1, a4 -; RV64-NEXT: bne a4, a2, .LBB3_2 +; RV64-NEXT: addi a0, a0, 4 +; RV64-NEXT: bne a1, a2, .LBB3_2 ; RV64-NEXT: .LBB3_3: # %while.end ; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 3250821..2b800c4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -44,9 +44,8 @@ ; CHECK-LABEL: m2_splat_with_tail: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3> ret <4 x i64> %res @@ -99,9 +98,8 @@ ; CHECK-LABEL: m2_splat_into_identity: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3> ret <4 x i64> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index ab98496..a4c793b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -36,7 +36,7 @@ ; CHECK-NEXT: .cfi_offset s10, -96 ; CHECK-NEXT: .cfi_offset s11, -104 ; CHECK-NEXT: li a6, 0 -; CHECK-NEXT: li s2, 8 +; CHECK-NEXT: li a7, 8 ; CHECK-NEXT: li t0, 12 ; CHECK-NEXT: li s0, 4 ; CHECK-NEXT: li t1, 20 @@ -45,7 +45,7 @@ ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: andi t3, a4, 1 -; CHECK-NEXT: li t2, 4 +; CHECK-NEXT: li s2, 4 ; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader.i ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_2 Depth 2 @@ -53,9 +53,9 @@ ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv t4, t1 -; CHECK-NEXT: mv t5, t2 +; CHECK-NEXT: mv t2, s2 ; CHECK-NEXT: mv t6, t0 -; CHECK-NEXT: mv a7, s2 +; CHECK-NEXT: mv s3, a7 ; CHECK-NEXT: mv s4, a6 ; CHECK-NEXT: .LBB0_2: # %for.cond5.preheader.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -64,9 +64,9 @@ ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv s5, t4 -; CHECK-NEXT: mv s6, t5 +; CHECK-NEXT: mv t5, t2 ; CHECK-NEXT: mv s7, t6 -; CHECK-NEXT: mv s3, a7 +; CHECK-NEXT: mv s8, s3 ; CHECK-NEXT: mv s9, s4 ; CHECK-NEXT: .LBB0_3: # %for.cond9.preheader.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -75,9 +75,9 @@ ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv s11, s5 -; CHECK-NEXT: mv a3, s6 +; CHECK-NEXT: mv s6, t5 ; CHECK-NEXT: mv ra, s7 -; CHECK-NEXT: mv s8, s3 +; CHECK-NEXT: mv a5, s8 ; CHECK-NEXT: mv s1, s9 ; CHECK-NEXT: .LBB0_4: # %vector.ph.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -92,45 +92,44 @@ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=3 ; CHECK-NEXT: # Parent Loop BB0_4 Depth=4 ; CHECK-NEXT: # => This Inner Loop Header: Depth=5 -; CHECK-NEXT: addi a5, a1, 4 -; CHECK-NEXT: add a4, s8, a1 -; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: add a4, a5, a1 +; CHECK-NEXT: add a3, s6, a1 +; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: vse32.v v8, (a4), v0.t -; CHECK-NEXT: 
vse32.v v8, (a1), v0.t -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: bne a5, s0, .LBB0_5 +; CHECK-NEXT: vse32.v v8, (a3), v0.t +; CHECK-NEXT: bne a1, s0, .LBB0_5 ; CHECK-NEXT: # %bb.6: # %for.cond.cleanup15.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=4 ; CHECK-NEXT: addi s1, s1, 4 -; CHECK-NEXT: addi s8, s8, 4 +; CHECK-NEXT: addi a5, a5, 4 ; CHECK-NEXT: addi ra, ra, 4 -; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi s6, s6, 4 ; CHECK-NEXT: andi s10, a0, 1 ; CHECK-NEXT: addi s11, s11, 4 ; CHECK-NEXT: beqz s10, .LBB0_4 ; CHECK-NEXT: # %bb.7: # %for.cond.cleanup11.i ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=3 ; CHECK-NEXT: addi s9, s9, 4 -; CHECK-NEXT: addi s3, s3, 4 +; CHECK-NEXT: addi s8, s8, 4 ; CHECK-NEXT: addi s7, s7, 4 -; CHECK-NEXT: addi s6, s6, 4 +; CHECK-NEXT: addi t5, t5, 4 ; CHECK-NEXT: andi a1, a2, 1 ; CHECK-NEXT: addi s5, s5, 4 ; CHECK-NEXT: beqz a1, .LBB0_3 ; CHECK-NEXT: # %bb.8: # %for.cond.cleanup7.i ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=2 ; CHECK-NEXT: addi s4, s4, 4 -; CHECK-NEXT: addi a7, a7, 4 +; CHECK-NEXT: addi s3, s3, 4 ; CHECK-NEXT: addi t6, t6, 4 -; CHECK-NEXT: addi t5, t5, 4 +; CHECK-NEXT: addi t2, t2, 4 ; CHECK-NEXT: addi t4, t4, 4 ; CHECK-NEXT: beqz t3, .LBB0_2 ; CHECK-NEXT: # %bb.9: # %for.cond.cleanup3.i ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: addi a6, a6, 4 -; CHECK-NEXT: addi s2, s2, 4 +; CHECK-NEXT: addi a7, a7, 4 ; CHECK-NEXT: addi t0, t0, 4 -; CHECK-NEXT: addi t2, t2, 4 +; CHECK-NEXT: addi s2, s2, 4 ; CHECK-NEXT: addi t1, t1, 4 ; CHECK-NEXT: beqz a1, .LBB0_1 ; CHECK-NEXT: # %bb.10: # %l.exit
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 57f1977..95bff27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -314,13 +314,10 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 @@ -329,7 +326,7 @@ ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: srli a1, a2, 3 ; CHECK-NEXT: slli a2, a2, 3 @@ -337,6 +334,15 @@ ; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: .LBB8_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a5, a4, 4 @@ -344,23 +350,33 @@ ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vand.vv v16, v0, v8 -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; 
CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vand.vv v16, v16, v8 ; CHECK-NEXT: vmsne.vi v24, v16, 0 ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 4 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs1r.v v24, (a4) # vscale x 8-byte Folded Spill -; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vand.vv v16, v0, v8 +; CHECK-NEXT: vmsne.vi v8, v16, 0 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a5, a4, 4 -; CHECK-NEXT: add a4, a5, a4 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vand.vv v16, v24, v8 -; CHECK-NEXT: vmsne.vi v8, v16, 0 +; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 4 ; CHECK-NEXT: add a4, sp, a4 @@ -381,22 +397,19 @@ ; CHECK-NEXT: addi a5, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; CHECK-NEXT: vor.vv v0, v0, v8 +; CHECK-NEXT: vor.vv v16, v16, v8 ; CHECK-NEXT: csrr a5, vlenb ; CHECK-NEXT: slli a5, a5, 3 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 ; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vor.vv v24, v24, v8 +; CHECK-NEXT: vor.vv v0, v0, v8 ; CHECK-NEXT: beqz a4, .LBB8_1 ; CHECK-NEXT: # %bb.2: # %middle.block ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add sp, sp, a1 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index f295bd8..386c736 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -2258,18 +2258,18 @@ ; CHECK-RV32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: .LBB98_3: # %vector.body ; CHECK-RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-RV32-NEXT: slli a7, a6, 2 -; CHECK-RV32-NEXT: add t0, a6, a4 -; CHECK-RV32-NEXT: add a7, a0, a7 -; CHECK-RV32-NEXT: vl2re32.v v8, (a7) -; CHECK-RV32-NEXT: sltu a6, t0, a6 -; CHECK-RV32-NEXT: add a5, a5, a6 -; CHECK-RV32-NEXT: xor a6, t0, a3 +; CHECK-RV32-NEXT: mv a7, a6 +; CHECK-RV32-NEXT: slli t0, a6, 2 +; CHECK-RV32-NEXT: add a6, a6, a4 +; CHECK-RV32-NEXT: add t0, a0, t0 +; CHECK-RV32-NEXT: vl2re32.v v8, (t0) +; CHECK-RV32-NEXT: sltu a7, a6, a7 +; CHECK-RV32-NEXT: add a5, a5, a7 +; CHECK-RV32-NEXT: xor a7, a6, a3 ; CHECK-RV32-NEXT: vand.vx v8, v8, a1 -; CHECK-RV32-NEXT: or t1, a6, a5 -; CHECK-RV32-NEXT: vs2r.v v8, (a7) -; CHECK-RV32-NEXT: mv a6, t0 -; CHECK-RV32-NEXT: bnez t1, .LBB98_3 +; CHECK-RV32-NEXT: or a7, a7, a5 +; CHECK-RV32-NEXT: vs2r.v v8, (t0) +; CHECK-RV32-NEXT: bnez a7, .LBB98_3 ; CHECK-RV32-NEXT: # %bb.4: # %middle.block ; CHECK-RV32-NEXT: bnez a3, .LBB98_6 ; CHECK-RV32-NEXT: .LBB98_5: # %for.body @@ -2350,18 +2350,18 @@ ; CHECK-ZVKB-NOZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-NOZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-NOZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-NOZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-NOZBB32-NEXT: add a7, a0, a7 -; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-NOZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-NOZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-NOZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-NOZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-NOZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-NOZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-NOZBB32-NEXT: vandn.vx 
v8, v8, a1 -; CHECK-ZVKB-NOZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-NOZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-NOZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-NOZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-NOZBB32-NEXT: bnez a3, .LBB98_7 ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_5: # %for.body.preheader @@ -2444,18 +2444,18 @@ ; CHECK-ZVKB-ZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-ZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-ZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-ZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-ZBB32-NEXT: add a7, a0, a7 -; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-ZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-ZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-ZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-ZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-ZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-ZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-ZBB32-NEXT: vandn.vx v8, v8, a1 -; CHECK-ZVKB-ZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-ZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-ZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-ZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-ZBB32-NEXT: bnez a3, .LBB98_6 ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_5: # %for.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll index ed6b7f1..1044008 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
@@ -25,24 +25,24 @@ ; RV32-NEXT: li a6, 0 ; RV32-NEXT: .LBB0_4: # %vector.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: slli t0, a7, 2 -; RV32-NEXT: addi t1, a7, 8 -; RV32-NEXT: add t0, a1, t0 +; RV32-NEXT: mv t0, a7 +; RV32-NEXT: slli t1, a7, 2 +; RV32-NEXT: addi a7, a7, 8 +; RV32-NEXT: add t1, a1, t1 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (t0) -; RV32-NEXT: sltu a7, t1, a7 -; RV32-NEXT: xor t0, t1, a5 -; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: vle32.v v8, (t1) +; RV32-NEXT: sltu t0, a7, t0 +; RV32-NEXT: xor t1, a7, a5 +; RV32-NEXT: add a6, a6, t0 ; RV32-NEXT: vmslt.vx v12, v8, a2 ; RV32-NEXT: vcompress.vm v10, v8, v12 -; RV32-NEXT: vcpop.m a7, v12 -; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV32-NEXT: vcpop.m t0, v12 +; RV32-NEXT: vsetvli zero, t0, e32, m2, ta, ma ; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: slli a7, a7, 2 -; RV32-NEXT: or t0, t0, a6 -; RV32-NEXT: add a0, a0, a7 -; RV32-NEXT: mv a7, t1 -; RV32-NEXT: bnez t0, .LBB0_4 +; RV32-NEXT: slli t0, t0, 2 +; RV32-NEXT: or t1, t1, a6 +; RV32-NEXT: add a0, a0, t0 +; RV32-NEXT: bnez t1, .LBB0_4 ; RV32-NEXT: # %bb.5: # %middle.block ; RV32-NEXT: bne a5, a3, .LBB0_9 ; RV32-NEXT: .LBB0_6: # %for.cond.cleanup
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll index 2293a1e..e8d89d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -895,21 +895,21 @@ ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: .LBB43_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli a3, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: slli a3, a1, 32 +; CHECK-NEXT: vsetvli a1, a2, e8, mf8, ta, ma ; CHECK-NEXT: vsetivli zero, 0, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: srli a3, a3, 32 ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 -; CHECK-NEXT: vslideup.vx v10, v9, a1, v0.t +; CHECK-NEXT: vslideup.vx v10, v9, a3, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t -; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vse32.v v10, (a0), v0.t ; CHECK-NEXT: li a2, 1 -; CHECK-NEXT: mv a1, a3 ; CHECK-NEXT: j .LBB43_1 entry: br label %vector.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index ead79fc..af3b085 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -102,20 +102,20 @@ ; RV32-NEXT: .LBB0_13: # %vector.body ; RV32-NEXT: # Parent Loop BB0_10 Depth=1 ; RV32-NEXT: # => This Inner Loop Header: Depth=2 -; RV32-NEXT: add s0, a2, t6 -; RV32-NEXT: add s1, a4, t6 -; RV32-NEXT: vl2r.v v8, (s0) -; RV32-NEXT: add s0, a0, t6 +; RV32-NEXT: mv s0, t6 +; RV32-NEXT: add t6, a2, t6 +; RV32-NEXT: add s1, a4, s0 +; RV32-NEXT: vl2r.v v8, (t6) +; RV32-NEXT: add s2, a0, s0 ; RV32-NEXT: vl2r.v v10, (s1) -; RV32-NEXT: add s1, t6, t2 -; RV32-NEXT: sltu t6, s1, t6 -; RV32-NEXT: add t5, t5, t6 -; RV32-NEXT: xor t6, s1, t4 +; RV32-NEXT: add t6, s0, t2 +; RV32-NEXT: sltu s0, t6, s0 +; RV32-NEXT: add t5, t5, s0 +; RV32-NEXT: xor s0, t6, t4 ; RV32-NEXT: vaaddu.vv v8, v8, v10 -; RV32-NEXT: or s2, t6, t5 -; RV32-NEXT: vs2r.v v8, (s0) -; RV32-NEXT: mv t6, s1 -; RV32-NEXT: bnez s2, .LBB0_13 +; RV32-NEXT: or s0, s0, t5 +; RV32-NEXT: vs2r.v v8, (s2) +; RV32-NEXT: bnez s0, .LBB0_13 ; RV32-NEXT: # %bb.14: # %middle.block ; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1 ; RV32-NEXT: beq t4, a6, .LBB0_9
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 1769c5d..98e082b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -21,11 +21,12 @@ ; ENABLED-NEXT: it lt ; ENABLED-NEXT: bxlt lr ; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: mov r11, r0 -; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: ldr r0, [sp, #36] ; ENABLED-NEXT: add.w r9, r2, #3 ; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov.w r8, #1 ; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 @@ -49,18 +50,16 @@ ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 -; ENABLED-NEXT: movs r7, #1 -; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: sub.w r4, r2, r12 +; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 +; ENABLED-NEXT: mov r7, r10 +; ENABLED-NEXT: add.w r6, r8, r0, lsr #2 ; ENABLED-NEXT: adds r0, r2, #3 ; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 -; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: dls lr, r0 +; ENABLED-NEXT: add.w lr, r8, r0, lsr #2 ; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -83,7 +82,7 @@ ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: @@ -92,11 +91,12 @@ ; NOREDUCTIONS-NEXT: it lt ; NOREDUCTIONS-NEXT: bxlt lr ; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: mov r11, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] ; NOREDUCTIONS-NEXT: add.w r9, r2, #3 ; 
NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov.w r8, #1 ; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 @@ -120,18 +120,16 @@ ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 -; NOREDUCTIONS-NEXT: movs r7, #1 -; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 +; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: mov r7, r10 +; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: adds r0, r2, #3 ; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dls lr, r0 +; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -154,7 +152,7 @@ ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index cbcbf1f..435acc2 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,74 +165,73 @@ ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r6, r3, #4 -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: add.w r9, r3, #4 +; CHECK-NEXT: add.w r10, r0, #4 ; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r9 +; CHECK-NEXT: @ implicit-def: $r6 ; CHECK-NEXT: @ implicit-def: $r4 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr.w r1, [r10] ; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r2, r4, [r8] -; CHECK-NEXT: adc r5, r1, #0 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: smull r4, r2, r4, r9 -; CHECK-NEXT: asrs r1, r5, #31 +; CHECK-NEXT: ldrd r5, r4, [r8] +; CHECK-NEXT: adc r2, r1, #0 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds.w r10, r4, #-2147483648 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: ldr r4, [r2, #-4] +; CHECK-NEXT: smull r4, r5, r4, r6 +; CHECK-NEXT: asrs r1, r2, #31 +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: subs r4, r2, r4 +; CHECK-NEXT: sbcs r1, r5 +; CHECK-NEXT: adds.w r6, r4, #-2147483648 +; CHECK-NEXT: ldr r4, [r10, #-4] +; CHECK-NEXT: adc r11, r1, #0 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: add.w r10, r10, #4 ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: adds.w r12, r4, #-2147483648 ; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr r4, [r6] +; CHECK-NEXT: ldr.w r4, [r9] ; CHECK-NEXT: adc r5, r5, #0 ; CHECK-NEXT: mul r2, r4, r0 -; CHECK-NEXT: adds 
r0, #4 ; CHECK-NEXT: add.w r2, r2, #-2147483648 ; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r5, r4, r12 -; CHECK-NEXT: lsll r2, r5, #30 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: mov r12, r5 -; CHECK-NEXT: lsll r12, r11, r4 -; CHECK-NEXT: mul r2, r2, r9 -; CHECK-NEXT: lsrl r12, r11, #2 -; CHECK-NEXT: adds r2, #2 -; CHECK-NEXT: lsll r12, r11, r2 +; CHECK-NEXT: smull r2, r9, r4, r12 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: lsll r2, r9, #30 +; CHECK-NEXT: asr.w r5, r9, #31 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: lsll r2, r5, r4 +; CHECK-NEXT: lsrl r2, r5, #2 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: lsll r2, r5, r0 +; CHECK-NEXT: add.w r0, r2, #-2147483648 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r12, #-2147483648 -; CHECK-NEXT: asrl r10, r1, r5 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrl r10, r1, #2 -; CHECK-NEXT: movs r1, #2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: str.w r10, [r1] -; CHECK-NEXT: ldr r1, [r8], #-4 -; CHECK-NEXT: mls r5, r1, r4, r5 -; CHECK-NEXT: adds.w r4, r5, #-2147483648 -; CHECK-NEXT: asr.w r1, r5, #31 +; CHECK-NEXT: asrl r6, r11, r0 +; CHECK-NEXT: movs r0, #2 +; CHECK-NEXT: lsrl r6, r11, #2 +; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: ldr r0, [r8], #-4 +; CHECK-NEXT: mls r0, r0, r4, r1 +; CHECK-NEXT: adds.w r4, r0, #-2147483648 +; CHECK-NEXT: asr.w r1, r0, #31 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r1, r4, #0 -; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r6, #-4] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: str r0, [r2] +; CHECK-NEXT: str r0, [r9, #-4] +; CHECK-NEXT: add.w r9, r9, #4 +; CHECK-NEXT: add.w r0, r12, #4 ; 
CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end ; CHECK-NEXT: add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 0d86f22..b60ee7c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -1313,27 +1313,29 @@ ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vmov r7, r3, d13 ; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vmov r4, r10, d12 +; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr ; CHECK-NEXT: vadd.i32 q4, q4, lr ; CHECK-NEXT: vadd.i32 q5, q5, lr +; CHECK-NEXT: vadd.i32 q7, q7, lr ; CHECK-NEXT: ldrb.w r11, [r3] ; CHECK-NEXT: ldrb r3, [r7] ; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r4, [r4] @@ -1342,7 +1344,7 @@ ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: vmov.8 q0[0], r7 ; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 +; CHECK-NEXT: vmov r1, r7, d3 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: vmov.8 q0[4], r4 @@ -1357,8 +1359,7 @@ ; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: ldrb.w r12, [r7] ; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov r4, r7, d2 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[8], r4 @@ -1370,7 +1371,6 @@ ; CHECK-NEXT: vmov.8 q0[14], r3 ; CHECK-NEXT: vmov.8 
q0[15], r12 ; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index eedca2c..c0b2da7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -236,11 +236,11 @@ ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vadd.i32 q3, q1, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2] ; CHECK-NEXT: bne .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr @@ -330,20 +330,20 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r4, .LCPI7_0 ; CHECK-NEXT: mov.w r12, #9 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: vmla.i32 q3, q1, lr -; CHECK-NEXT: vmul.i32 q1, q1, r12 -; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmla.i32 q3, q2, lr ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q1, [r3] -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmul.i32 q2, q2, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 +; CHECK-NEXT: vstrw.32 q2, [r3] ; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -390,22 +390,22 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w r12, #9 ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q2, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; 
CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vadd.i32 q3, q2, r4 -; CHECK-NEXT: vmla.i32 q4, q2, lr +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmla.i32 q4, q3, lr ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q2, r12 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmla.i32 q4, q3, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 ; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] ; CHECK-NEXT: bne .LBB8_1 @@ -487,21 +487,21 @@ ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov q7, q2 +; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmlas.i32 q7, q0, r7 -; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmlas.i32 q1, q0, r7 +; CHECK-NEXT: vmov q7, q4 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q0, q7, q3 -; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] -; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! -; CHECK-NEXT: vmul.i32 q1, q1, q7 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vadd.i32 q5, q1, q5 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [q7, #32]! 
+; CHECK-NEXT: vmul.i32 q0, q0, q6 +; CHECK-NEXT: vadd.i32 q5, q0, q5 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 @@ -702,12 +702,12 @@ ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vadd.i32 q5, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1] +; CHECK-NEXT: vldrh.s32 q6, [r3], #8 +; CHECK-NEXT: vmul.i32 q6, q7, q6 +; CHECK-NEXT: vadd.i32 q4, q6, q4 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 @@ -922,15 +922,15 @@ ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vldrb.s32 q2, [r0, q5] -; CHECK-NEXT: vadd.i32 q7, q5, q0 -; CHECK-NEXT: vldrb.s32 q5, [r1, q4] -; CHECK-NEXT: vadd.i32 q6, q4, q0 -; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vldrb.s32 q2, [r0, q7] +; CHECK-NEXT: vldrb.s32 q7, [r1, q6] ; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: vmlava.u32 r12, q2, q5 -; CHECK-NEXT: vmov q5, q7 -; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlava.u32 r12, q2, q7 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll index 828f8e4..652d25a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -105,68 +105,66 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, #12 ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: stm.w sp, {r0, r1, r3} @ 12-byte Folded Spill ; CHECK-NEXT: blt .LBB4_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: ldr r7, [sp, #44] -; CHECK-NEXT: add.w r10, r2, #3 +; CHECK-NEXT: ldr r1, [sp, #48] +; CHECK-NEXT: add.w r12, r2, #3 ; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r9, r2 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov r8, r2 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: uxth.w r12, r7 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: str.w r9, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: uxth r3, r1 ; CHECK-NEXT: b .LBB4_4 ; CHECK-NEXT: .LBB4_2: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: .LBB4_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: lsrs r0, r6, #16 -; CHECK-NEXT: sub.w r10, r10, #1 -; CHECK-NEXT: strh.w r0, [r5, r8, lsl #1] -; CHECK-NEXT: add.w r8, r8, #1 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: lsrs r2, r6, #16 +; CHECK-NEXT: sub.w r12, r12, #1 ; CHECK-NEXT: add.w r11, r11, #2 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: sub.w r9, r9, #1 -; CHECK-NEXT: cmp r8, r3 +; CHECK-NEXT: sub.w r8, r8, #1 +; CHECK-NEXT: strh.w r2, [r7, r10, lsl #1] +; CHECK-NEXT: add.w r10, r10, #1 +; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp r10, r2 +; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: beq .LBB4_12 ; CHECK-NEXT: .LBB4_4: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB4_11 Depth 2 -; CHECK-NEXT: cmp r0, r8 +; CHECK-NEXT: cmp r2, r10 ; CHECK-NEXT: ble .LBB4_2 ; CHECK-NEXT: @ %bb.5: @ 
%vector.main.loop.iter.check ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: sub.w r0, r0, r8 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: cmp r0, #8 +; CHECK-NEXT: sub.w r4, r2, r10 +; CHECK-NEXT: cmp r4, #8 ; CHECK-NEXT: bhs .LBB4_7 ; CHECK-NEXT: @ %bb.6: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: b .LBB4_10 ; CHECK-NEXT: .LBB4_7: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: bic r7, r9, #7 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: subs r7, #8 -; CHECK-NEXT: bic r1, r0, #7 -; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: add.w lr, r6, r7, lsr #3 +; CHECK-NEXT: bic r2, r8, #7 +; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: subs r2, #8 +; CHECK-NEXT: bic r9, r4, #7 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: add.w lr, r7, r2, lsr #3 +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload ; CHECK-NEXT: .LBB4_8: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.u16 q0, [r4], #16 +; CHECK-NEXT: vldrh.u16 q0, [r2], #16 ; CHECK-NEXT: vldrh.u16 q1, [r5], #16 -; CHECK-NEXT: rsb.w r7, r12, #0 +; CHECK-NEXT: rsbs r7, r3, #0 ; CHECK-NEXT: vmullb.s16 q2, q1, q0 ; CHECK-NEXT: vmullt.s16 q0, q1, q0 ; CHECK-NEXT: vshl.s32 q2, r7 @@ -176,32 +174,29 @@ ; CHECK-NEXT: le lr, .LBB4_8 ; CHECK-NEXT: @ %bb.9: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: cmp r4, r9 ; CHECK-NEXT: beq .LBB4_3 ; CHECK-NEXT: .LBB4_10: @ %vec.epilog.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r4, r1, r8 -; CHECK-NEXT: sub.w r7, r9, r1 -; CHECK-NEXT: add.w r2, r0, r1, lsl #1 -; CHECK-NEXT: add.w r4, r0, r4, lsl #1 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: dlstp.32 lr, r7 
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r2, r9, r10 +; CHECK-NEXT: add.w r7, r1, r9, lsl #1 +; CHECK-NEXT: add.w r2, r1, r2, lsl #1 +; CHECK-NEXT: sub.w r5, r8, r9 +; CHECK-NEXT: dlstp.32 lr, r5 ; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: rsb.w r0, r12, #0 -; CHECK-NEXT: vldrh.s32 q0, [r2], #8 -; CHECK-NEXT: vldrh.s32 q1, [r4], #8 +; CHECK-NEXT: rsbs r4, r3, #0 +; CHECK-NEXT: vldrh.s32 q0, [r7], #8 +; CHECK-NEXT: vldrh.s32 q1, [r2], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vshl.s32 q0, r0 +; CHECK-NEXT: vshl.s32 q0, r4 ; CHECK-NEXT: vaddva.u32 r6, q0 ; CHECK-NEXT: letp lr, .LBB4_11 ; CHECK-NEXT: b .LBB4_3 ; CHECK-NEXT: .LBB4_12: @ %for.end17 -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #12 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %conv = sext i16 %Ls to i32
diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 43ed5ee..d6c5cde 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -18,50 +18,50 @@ ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 ; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: vldrh.u16 q2, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 +; CHECK-NEXT: vldrh.u16 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 ; CHECK-NEXT: movs r6, #14 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: 
vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q1, [r0] +; CHECK-NEXT: vldrht.u16 q2, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll index 75612ba..9e4faa96 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
@@ -15,16 +15,15 @@ ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label0: ; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 3 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: local.tee 3 -; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: local.get 3 ; CHECK-NEXT: v128.load 0:p2align=0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shl ; CHECK-NEXT: v128.store 0 -; CHECK-NEXT: local.get 3 -; CHECK-NEXT: local.set 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.const -1 ; CHECK-NEXT: i32.add @@ -64,10 +63,11 @@ ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label1: ; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 3 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: local.tee 3 -; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: local.get 3 ; CHECK-NEXT: v128.load 0:p2align=0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shl @@ -76,8 +76,6 @@ ; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: i32.and ; CHECK-NEXT: local.set 1 -; CHECK-NEXT: local.get 3 -; CHECK-NEXT: local.set 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.const -1 ; CHECK-NEXT: i32.add
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll index 8a8e7a3..06cf968 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -297,30 +297,30 @@ ; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl $buf, %edx -; CHECK-NEXT: movl $32, %esi +; CHECK-NEXT: movl $buf, %ecx +; CHECK-NEXT: movl $32, %edx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: jmp .LBB5_1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_3: # %if.false ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: decl %eax +; CHECK-NEXT: decl %esi ; CHECK-NEXT: .LBB5_4: # %loop.bb2 ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: leal (%rdi,%rax), %r8d +; CHECK-NEXT: leal (%rdi,%rsi), %r8d ; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpw $7, %ax +; CHECK-NEXT: cmpw $7, %si ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx) ; CHECK-NEXT: jne .LBB5_5 ; CHECK-NEXT: .LBB5_1: # %loop.bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB5_3 ; CHECK-NEXT: # %bb.2: # %if.true ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: incl %eax +; CHECK-NEXT: incl %esi ; CHECK-NEXT: jmp .LBB5_4 ; CHECK-NEXT: .LBB5_5: # %exit ; CHECK-NEXT: tilerelease
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index cffd88c..477a0dc 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -111,62 +111,63 @@ ; X86-NOBMI-NEXT: orl %ecx, %eax ; X86-NOBMI-NEXT: je .LBB1_3 ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader -; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: xorl %esi, %esi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NOBMI-NEXT: xorl %edi, %edi +; X86-NOBMI-NEXT: xorl %ebp, %ebp ; X86-NOBMI-NEXT: .p2align 4 ; X86-NOBMI-NEXT: .LBB1_2: # %for.body ; X86-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NOBMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl (%eax,%edi,8), %ebp +; X86-NOBMI-NEXT: movl 4(%eax,%edi,8), %ebx +; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) ; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi -; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx -; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi -; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl $0, %edx +; X86-NOBMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; 
X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi -; X86-NOBMI-NEXT: adcl %ebx, %ebp -; X86-NOBMI-NEXT: setb %bl +; X86-NOBMI-NEXT: movl %eax, %ebp +; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %eax -; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) -; X86-NOBMI-NEXT: addl $1, %ecx -; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ecx, %esi -; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebp, %edi -; X86-NOBMI-NEXT: orl %esi, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebx, %esi +; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movzbl (%esp), %ebx # 1-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: adcl %ebx, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload 
+; X86-NOBMI-NEXT: adcl %eax, %ebp +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: adcl $0, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl %edx, (%eax,%edi,8) +; X86-NOBMI-NEXT: movl %ebp, 4(%eax,%edi,8) +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: addl $1, %edi +; X86-NOBMI-NEXT: adcl $0, %ebp +; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: xorl %edx, %eax +; X86-NOBMI-NEXT: movl %ebp, %edx +; X86-NOBMI-NEXT: xorl %ebx, %edx +; X86-NOBMI-NEXT: orl %eax, %edx ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax @@ -184,71 +185,66 @@ ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: subl $20, %esp +; X86-BMI-NEXT: subl $16, %esp ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: orl %ecx, %eax ; X86-BMI-NEXT: je .LBB1_3 ; X86-BMI-NEXT: # %bb.1: # %for.body.preheader -; X86-BMI-NEXT: xorl %ecx, %ecx -; X86-BMI-NEXT: xorl %eax, %eax +; X86-BMI-NEXT: xorl %esi, %esi +; X86-BMI-NEXT: xorl %edi, %edi ; X86-BMI-NEXT: xorl %ebx, %ebx -; X86-BMI-NEXT: xorl %ebp, %ebp +; X86-BMI-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-BMI-NEXT: .p2align 4 ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %ebp +; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: mulxl %eax, %edx, %edi +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edx, %eax +; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax -; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax -; X86-BMI-NEXT: setb %dl -; X86-BMI-NEXT: addl %ebp, %ecx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI-NEXT: movzbl %dl, %edx -; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI-NEXT: addl $1, %ebx +; X86-BMI-NEXT: movl %ebp, %edx +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %ebp +; X86-BMI-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X86-BMI-NEXT: adcl $0, %ebp -; X86-BMI-NEXT: movl %ebx, %edx -; X86-BMI-NEXT: xorl %esi, %edx -; X86-BMI-NEXT: movl %ebp, %esi -; X86-BMI-NEXT: xorl %edi, %esi -; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-BMI-NEXT: movl %ecx, %edx +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx +; X86-BMI-NEXT: addl %eax, %ecx +; X86-BMI-NEXT: movl %edi, (%esp) # 
4-byte Spill +; X86-BMI-NEXT: movl %esi, %eax +; X86-BMI-NEXT: adcl %ebp, %edx +; X86-BMI-NEXT: movl %edx, %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %esi, %edi +; X86-BMI-NEXT: setb %dl +; X86-BMI-NEXT: addl %ebp, %esi +; X86-BMI-NEXT: movzbl %dl, %edx +; X86-BMI-NEXT: adcl %edx, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: addl %eax, %edx +; X86-BMI-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: adcl $0, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %edx, (%eax,%ebx,8) +; X86-BMI-NEXT: movl %ecx, 4(%eax,%ebx,8) +; X86-BMI-NEXT: addl $1, %ebx +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: movl %ebx, %eax +; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: xorl %ebp, %ecx +; X86-BMI-NEXT: orl %eax, %ecx ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %edx, %edx -; X86-BMI-NEXT: addl $20, %esp +; X86-BMI-NEXT: addl $16, %esp ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx @@ -261,11 +257,12 @@ ; X64-NOBMI-NEXT: je .LBB1_3 ; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X64-NOBMI-NEXT: movq %rdx, %r8 -; X64-NOBMI-NEXT: xorl %r10d, %r10d +; X64-NOBMI-NEXT: xorl %edx, %edx ; X64-NOBMI-NEXT: xorl %r9d, %r9d ; X64-NOBMI-NEXT: .p2align 4 ; X64-NOBMI-NEXT: .LBB1_2: # %for.body ; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: movq %rcx, %rax ; X64-NOBMI-NEXT: mulq (%r8,%r9,8) ; X64-NOBMI-NEXT: addq %r10, %rax @@ -273,7 +270,6 @@ ; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) ; X64-NOBMI-NEXT: incq %r9 ; X64-NOBMI-NEXT: cmpq %r9, %rdi -; 
X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end ; X64-NOBMI-NEXT: xorl %eax, %eax @@ -285,11 +281,12 @@ ; X64-BMI-NEXT: je .LBB1_3 ; X64-BMI-NEXT: # %bb.1: # %for.body.preheader ; X64-BMI-NEXT: movq %rdx, %rax -; X64-BMI-NEXT: xorl %r9d, %r9d +; X64-BMI-NEXT: xorl %edx, %edx ; X64-BMI-NEXT: xorl %r8d, %r8d ; X64-BMI-NEXT: .p2align 4 ; X64-BMI-NEXT: .LBB1_2: # %for.body ; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: movq %rcx, %rdx ; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx ; X64-BMI-NEXT: addq %r9, %r10 @@ -297,7 +294,6 @@ ; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) ; X64-BMI-NEXT: incq %r8 ; X64-BMI-NEXT: cmpq %r8, %rdi -; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll index 0800373..ebae51f 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
@@ -16,11 +16,11 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: # %bb ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movw %cx, X -; CHECK-NEXT: movw %dx, Y -; CHECK-NEXT: incl %ecx -; CHECK-NEXT: addl $4, %edx -; CHECK-NEXT: cmpl %ecx, %eax +; CHECK-NEXT: movw %dx, X +; CHECK-NEXT: movw %cx, Y +; CHECK-NEXT: incl %edx +; CHECK-NEXT: addl $4, %ecx +; CHECK-NEXT: cmpl %edx, %eax ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: .LBB0_3: # %return ; CHECK-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index 209ee79..2a2a4a5 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1480,15 +1480,15 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB10_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1728,10 +1728,10 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm7, %ymm4, %ymm4 ; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 @@ -1739,9 +1739,9 @@ ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB11_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm0, 
%ymm0 -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1765,15 +1765,15 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2 -; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax ; AVX512-NEXT: jne .LBB11_1 ; AVX512-NEXT: # %bb.2: # %middle.block -; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll index 1a7551f..173c411 100644 --- a/llvm/test/CodeGen/X86/pr49451.ll +++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -18,15 +18,15 @@ ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB0_1: # %for.body612 ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: testb %bl, %bl +; X86-NEXT: testb %dl, %dl ; X86-NEXT: je .LBB0_2 ; X86-NEXT: # %bb.3: # %if.end1401 ; X86-NEXT: # in Loop: Header=BB0_1 Depth=1 ; X86-NEXT: addl %eax, %esi ; X86-NEXT: movw %si, s_2 -; X86-NEXT: movw %dx, s_0 +; X86-NEXT: movw %bx, s_0 ; X86-NEXT: incl %ecx -; X86-NEXT: incl %edx +; X86-NEXT: incl %ebx ; X86-NEXT: cmpw $73, %cx ; X86-NEXT: jl .LBB0_1 ; X86-NEXT: # %bb.4: # %for.body1703
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index c096223..1c3d27f 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -12729,43 +12729,43 @@ ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movl (%rsi), %edi ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: leal (,%rdi,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: andl $56, %edi +; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq %r9, %rax +; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK9-NEXT: movq %rdi, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK9-NEXT: # kill: 
def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 @@ -12906,45 +12906,45 @@ ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d +; FALLBACK12-NEXT: movl (%rsi), %r10d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: leal (,%r10,8), %eax ; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: andl $56, %r10d +; FALLBACK12-NEXT: movq -128(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq -120(%rsp,%r10), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movq -104(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq %r9, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq -96(%rsp,%r10), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, 
%r11 ; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq -112(%rsp,%r10), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: addq %r9, %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: orq %r14, %r9 +; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK12-NEXT: movq -80(%rsp,%r10), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 @@ -12957,8 +12957,8 @@ ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r10 +; FALLBACK12-NEXT: leaq (%r10,%r10), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 @@ -12969,13 +12969,13 @@ ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r8, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq %r10, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r9, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp @@ -13111,40 +13111,40 @@ ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; 
FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movl (%rsi), %edi ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: leal (,%rdi,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: andl $56, %edi +; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq %r9, %rax +; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK15-NEXT: movq %rdi, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; 
FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rsi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index 37620ec..9fbbba2 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -1185,10 +1185,10 @@ ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB14_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: cmpl %esi, %edi -; ENABLE-NEXT: setl %al +; ENABLE-NEXT: movl %esi, %eax ; ENABLE-NEXT: xorl %esi, %esi -; ENABLE-NEXT: movb %al, %sil +; ENABLE-NEXT: cmpl %eax, %edi +; ENABLE-NEXT: setl %sil ; ENABLE-NEXT: incb %dl ; ENABLE-NEXT: cmpb $45, %dl ; ENABLE-NEXT: jl LBB14_2 @@ -1220,10 +1220,10 @@ ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB14_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: cmpl %esi, %edi -; DISABLE-NEXT: setl %al +; DISABLE-NEXT: movl %esi, %eax ; DISABLE-NEXT: xorl %esi, %esi -; DISABLE-NEXT: movb %al, %sil +; DISABLE-NEXT: cmpl %eax, %edi +; DISABLE-NEXT: setl %sil ; DISABLE-NEXT: incb %dl ; DISABLE-NEXT: cmpb $45, %dl ; DISABLE-NEXT: jl LBB14_2
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef668..59fbf71 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll
@@ -62,12 +62,12 @@ ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB3_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -78,12 +78,12 @@ ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB3_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB3_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -94,12 +94,12 @@ ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB3_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB3_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -126,13 +126,13 @@ ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: testw %dx, %dx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: xorl %edx, %eax +; 
X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: testw %cx, %cx ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -144,13 +144,13 @@ ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB4_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: testw %cx, %cx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %esi, %ecx +; X64-LIN-NEXT: xorl %ecx, %eax +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi +; X64-LIN-NEXT: testw %si, %si ; X64-LIN-NEXT: jne .LBB4_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -163,13 +163,13 @@ ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB4_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: testw %cx, %cx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %edx, %ecx +; X64-WIN-NEXT: xorl %ecx, %eax +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx +; X64-WIN-NEXT: testw %dx, %dx ; X64-WIN-NEXT: jne .LBB4_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -197,12 +197,12 @@ ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB5_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorb %cl, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notb %dl -; X86-NEXT: andb %cl, %dl -; X86-NEXT: addb %dl, %dl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notb %cl +; 
X86-NEXT: andb %dl, %cl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -213,12 +213,12 @@ ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB5_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorb %sil, %al -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notb %cl -; X64-LIN-NEXT: andb %sil, %cl -; X64-LIN-NEXT: addb %cl, %cl -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notb %sil +; X64-LIN-NEXT: andb %cl, %sil +; X64-LIN-NEXT: addb %sil, %sil ; X64-LIN-NEXT: jne .LBB5_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $al killed $al killed $eax @@ -230,12 +230,12 @@ ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB5_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorb %dl, %al -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notb %cl -; X64-WIN-NEXT: andb %dl, %cl -; X64-WIN-NEXT: addb %cl, %cl -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notb %dl +; X64-WIN-NEXT: andb %cl, %dl +; X64-WIN-NEXT: addb %dl, %dl ; X64-WIN-NEXT: jne .LBB5_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -262,12 +262,12 @@ ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB6_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -278,12 +278,12 @@ ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB6_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; 
X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: xorl $2147483646, %esi # imm = 0x7FFFFFFE +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB6_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -294,12 +294,12 @@ ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB6_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB6_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq