[AMDGPU] Do not put memory instructions in *ALU SchedGroups (#162560)
Classifying some memory instructions as VALU leads to unexpected
behavior from the sched*barrier intrinsics.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1..5700468 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2394,15 +2394,19 @@
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
- Result = true;
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+ // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
+ // For our purposes, these shall not be classified as VALU as this results
+ // in unexpected behavior.
+ Result = !MI.mayLoadOrStore();
+ }
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
TII->isSALU(MI))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
TII->isMFMAorWMMA(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
new file mode 100644
index 0000000..a4aad57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+name: buffer_load_lds_not_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: buffer_load_lds_not_valu
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 0
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 1
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec
+ ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; CHECK-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec
+ %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ $m0 = S_MOV_B32 1
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 4, 0
+ S_ENDPGM 0
+...