# Source blob: 39068b42fcf3a01af4bab088c33f4fce68b0d0c3
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
---
# Test: DS prefetch with flush points in a single-block loop.
# The preheader (bb.0) issues three DS loads whose results are read in the
# loop. The loop (bb.1) issues four DS loads: the result of the second one is
# used in the same iteration (creating a flush point that covers the first two
# loads), while the last two are not used until the next iteration (prefetches).
# Expected: S_WAIT_DSCNT 0 in the preheader and only non-zero waits
# (S_WAIT_DSCNT 1, 2, 2) inside the loop.
name: ds_prefetch_flushed
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  ; CHECK-LABEL: name: ds_prefetch_flushed
  ; CHECK: bb.0:
  ; CHECK-NEXT: successors: %bb.1(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr0, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 0
  ; CHECK-NEXT: S_BRANCH %bb.1
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.1:
  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 1
  ; CHECK-NEXT: $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec
  ; CHECK-NEXT: S_BARRIER
  ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 2
  ; CHECK-NEXT: $vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec
  ; CHECK-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 2
  ; CHECK-NEXT: $vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
  ; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
  ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
  ; CHECK-NEXT: S_BRANCH %bb.2
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: S_ENDPGM 0
  bb.0:
    successors: %bb.1
    liveins: $sgpr0, $vgpr0

    ; Preheader: DS loads whose results are used inside the loop
    $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
    $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
    $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2
    liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32

    ; Use preheader value ($vgpr10/$vgpr11 are never redefined in the loop)
    $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec

    ; Use $vgpr28: written by the preheader load on loop entry, and by the
    ; offset-96 load of the previous iteration on the backedge (prefetch)
    $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec

    ; Barrier
    S_BARRIER

    ; First two loads will be "flushed" by the same-iteration use of $vgpr24 below
    $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
    $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec

    ; Use $vgpr32: written by the preheader load on loop entry, and by the
    ; offset-112 load of the previous iteration on the backedge (prefetch)
    $vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec

    ; These two remain unflushed - true prefetches for the next iteration
    $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
    $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec

    ; Use $vgpr24/$vgpr25 - creates the flush point, flushes loads 1-2
    $vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec

    ; Loop control
    $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0
...