| # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 |
| # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s |
| # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s |
| |
| # Regression test for the fix that removed the incorrect S_BARRIER check for DS stores. |
| # Previously, the code would reset SeenDSStoreInCurrMBB when it encountered an |
| # S_BARRIER, incorrectly assuming that stores postdominated by a barrier would |
| # already be waited on at the barrier. That assumption was wrong because: |
| # 1. S_BARRIER without AutoWaitcntBeforeBarrier does not automatically wait for DS stores to complete. |
| # 2. S_BARRIER with the BackOffBarrier feature does not later flush memory ops by adding a zero waitcnt. |
| # |
| # This test ensures that when a loop contains a DS store followed by an S_BARRIER, |
| # the preheader flush optimization is NOT applied (no S_WAIT_DSCNT in the preheader). |
| # The wait should be emitted inside the loop instead. |
| |
| --- |
| # Test: DS store followed by S_BARRIER in a single-block loop (bb.1). |
| # The preheader (bb.0) issues a DS load whose result ($vgpr10) is read in the loop. |
| # The preheader should NOT have S_WAIT_DSCNT because SeenDSStoreInLoop = true. |
| # Instead, the wait is expected at the top of the loop body, before the first |
| # use of $vgpr10. |
| name: ds_store_barrier_no_preheader_flush |
| tracksRegLiveness: true |
| machineFunctionInfo: |
| isEntryFunction: true |
| body: | |
| ; CHECK-LABEL: name: ds_store_barrier_no_preheader_flush |
| ; CHECK: bb.0: |
| ; CHECK-NEXT: successors: %bb.1(0x80000000) |
| ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2 |
| ; CHECK-NEXT: {{ $}} |
| ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec |
| ; Verify NO S_WAIT_DSCNT in preheader - the wait must be inside the loop |
| ; CHECK-NOT: S_WAIT_DSCNT |
| ; CHECK-NEXT: S_BRANCH %bb.1 |
| ; CHECK-NEXT: {{ $}} |
| ; CHECK-NEXT: bb.1: |
| ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) |
| ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr10 |
| ; CHECK-NEXT: {{ $}} |
| ; CHECK-NEXT: S_WAIT_DSCNT 0 |
| ; CHECK-NEXT: $vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr1, implicit $mode, implicit $exec |
| ; CHECK-NEXT: DS_WRITE_B32 $vgpr2, $vgpr30, 0, 0, implicit $m0, implicit $exec |
| ; With BackOffBarrier, no S_WAIT_DSCNT needed before S_BARRIER |
| ; CHECK-NEXT: S_BARRIER |
| ; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc |
| ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc |
| ; CHECK-NEXT: S_BRANCH %bb.2 |
| ; CHECK-NEXT: {{ $}} |
| ; CHECK-NEXT: bb.2: |
| ; CHECK-NEXT: S_ENDPGM 0 |
| bb.0: |
| successors: %bb.1 |
| liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2 |
| |
| ; Preheader: issue the DS load. The expected output has no wait in this block. |
| $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec |
| S_BRANCH %bb.1 |
| |
| bb.1: |
| successors: %bb.1, %bb.2 |
| liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr10 |
| |
| ; First use of the DS-loaded $vgpr10 from the preheader. The expected output |
| ; places S_WAIT_DSCNT 0 immediately before this instruction. |
| $vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr1, implicit $mode, implicit $exec |
| |
| ; DS store followed by barrier - this should NOT reset SeenDSStoreInCurrMBB. |
| ; The expected output has no wait between the store and S_BARRIER. |
| DS_WRITE_B32 $vgpr2, $vgpr30, 0, 0, implicit $m0, implicit $exec |
| S_BARRIER |
| |
| ; Loop control: decrement $sgpr0, branch back to bb.1 when SCC is set, |
| ; otherwise exit to bb.2. |
| $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc |
| S_CBRANCH_SCC1 %bb.1, implicit $scc |
| S_BRANCH %bb.2 |
| |
| bb.2: |
| ; Loop exit: end the program. |
| S_ENDPGM 0 |
| ... |