blob: f8a53b7749089fb66de10aceacecaaa54d76a780 [file] [log] [blame] [edit]
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
---
# Test 1: Simple case - DS loads only in preheader, no DS loads in loop.
# The optimization flushes DSCNT in the preheader so that
# subsequent loop iterations don't need to wait inside the loop.
name: ds_preheader_flush_simple
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
; CHECK-LABEL: name: ds_preheader_flush_simple
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0, $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 0
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $vgpr14, $vgpr18, $vgpr22
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr14, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr18, $vgpr22, implicit $mode, implicit $exec
; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
liveins: $sgpr0, $vgpr0
; Preheader: DS loads
$vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
$vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
$vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
liveins: $sgpr0, $vgpr0, $vgpr10, $vgpr14, $vgpr18, $vgpr22
; Use DS-loaded values (no DS loads inside the loop)
$vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr14, implicit $mode, implicit $exec
$vgpr31 = V_ADD_F32_e32 $vgpr18, $vgpr22, implicit $mode, implicit $exec
; Loop control
$sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Test 2: Prefetch pattern - DS loads both in preheader AND at end of loop.
# USE comes before DEF (prefetch pattern): values are used first, then
# new values are loaded for the next iteration. Since the DEF happens
# AFTER the USE in program order, we can still flush in preheader.
# This helps iteration 1 avoid waiting inside the loop.
#
# The cascading wait pattern shows decreasing dscnt values (12, 8, 4, 0)
# as each group of registers becomes ready.
name: ds_loop_prefetch_pattern
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
; CHECK-LABEL: name: ds_loop_prefetch_pattern
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0, $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 0
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAIT_DSCNT 12
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 8
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 4
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 0
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_BARRIER
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
liveins: $sgpr0, $vgpr0
; Preheader: DS loads for first iteration
$vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
$vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
$vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
$vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
$vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
$vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
$vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
$vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
$vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
$vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
$vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
$vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
$vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
$vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
; WMMA using vgpr10-25 (loaded in preheader or previous iteration's prefetch)
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; More WMMAs using other registers
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; Repeat with same registers
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; Barrier
S_BARRIER
; Prefetch DS loads for next iteration (this prevents simple optimization)
$vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
$vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
$vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
$vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
$vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
$vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
$vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
$vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
$vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
$vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
$vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
$vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
$vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
$vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
; Loop control
$sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...