 ; llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll (llvm-project.git, Git at Google)

 ; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s

 ; Reduced from rocrand's threefry2x32_20 kernel.
 ; The AMDGPU LSR cost model should avoid creating a redundant VGPR induction
 ; variable when the loop already has a vector IV incremented by a uniform
 ; (SGPR) stride. Without the cost model fix, LSR introduces a second v_add
 ; in the loop body, wasting a VGPR and a VALU slot every iteration.

 ; Intrinsic called once in the kernel's entry block; its i32 result (%tid) is
 ; the start value of the %iv.vec induction variable.
 declare i32 @llvm.amdgcn.workitem.id.x() #0

 ; CHECK-LABEL: {{^}}lsr_vector_iv_cost:
 ; The loop must contain exactly one VALU add — the single vector IV update.
 ; A second v_add_u32 here would mean LSR created a redundant IV.
 ; CHECK:      {{^}}.LBB0_1:
 ; CHECK:      v_add_u32
 ; CHECK-NOT:  v_add_u32
 ; CHECK:      s_cbranch
 ; Kernel under test: a single self-looping block with two loop-carried phis.
 ;   %arg0   - <2 x i32> kernel argument; only element 0 is read.
 ;   %stride - i32 kernel argument added to %iv.vec every iteration (the stride
 ;             the test description above refers to as uniform).
 ;   %out    - global (addrspace 1) pointer that receives the final %or value.
 define amdgpu_kernel void @lsr_vector_iv_cost(<2 x i32> %arg0, i32 %stride, ptr addrspace(1) %out) {
 entry:
   ; Start value of the %iv.vec recurrence.
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %loop

 loop:
   ; Second loop-carried value: starts at 0 and is replaced by %or on each
   ; back edge. Its only user in this function is %shr below.
   %iv.pn = phi i32 [ 0, %entry ], [ %or, %loop ]
   ; Induction variable: starts at %tid and advances by %stride per iteration.
   %iv.vec = phi i32 [ %tid, %entry ], [ %sum1, %loop ]
   %sum1 = add i32 %iv.vec, %stride
   ; Loop-invariant: the operand is a function argument.
   %elt = extractelement <2 x i32> %arg0, i64 0
   ; Chain of arithmetic derived from the updated IV; it ends in %or, which
   ; feeds %iv.pn on the back edge and the store in %exit.
   %sum2 = add i32 %sum1, %elt
   %xor = xor i32 1, %sum2
   %sum3 = add i32 %sum2, %xor
   %sum4 = add i32 %sum3, %elt
   %or = or i32 %sum4, %stride
   ; %shr has no users in this function.
   ; NOTE(review): presumably left over from test reduction to keep %iv.pn
   ; referenced - confirm before removing.
   %shr = lshr i32 %iv.pn, 1
   ; Keep looping while the updated IV is below 1024 (unsigned compare).
   %cmp = icmp ult i32 %sum1, 1024
   br i1 %cmp, label %loop, label %exit

 exit:
   ; Publish the value of %or from the final iteration.
   store i32 %or, ptr addrspace(1) %out
   ret void
 }

 ; Attribute group referenced as #0 by the @llvm.amdgcn.workitem.id.x declaration.
 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
	; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s

	; Reduced from rocrand's threefry2x32_20 kernel.
	; The AMDGPU LSR cost model should avoid creating a redundant VGPR induction
	; variable when the loop already has a vector IV incremented by a uniform
	; (SGPR) stride. Without the cost model fix, LSR introduces a second v_add
	; in the loop body, wasting a VGPR and a VALU slot every iteration.

	; NOTE(review): this repeats the @llvm.amdgcn.workitem.id.x declaration that
	; already appears earlier in this file - likely a copy/extraction artifact; confirm.
	; Intrinsic whose i32 result (%tid) is the start value of %iv.vec in the kernel.
	declare i32 @llvm.amdgcn.workitem.id.x() #0

	; CHECK-LABEL: {{^}}lsr_vector_iv_cost:
	; The loop must contain exactly one VALU add — the single vector IV update.
	; A second v_add_u32 here would mean LSR created a redundant IV.
	; CHECK: {{^}}.LBB0_1:
	; CHECK: v_add_u32
	; CHECK-NOT: v_add_u32
	; CHECK: s_cbranch
	; NOTE(review): @lsr_vector_iv_cost is already defined earlier in this file
	; with an identical body. LLVM IR does not allow defining the same function
	; twice, so this copy looks like a copy/extraction artifact - confirm and keep
	; only one definition.
	;
	; Kernel under test: a single self-looping block with two loop-carried phis.
	;   %arg0   - <2 x i32> kernel argument; only element 0 is read.
	;   %stride - i32 kernel argument added to %iv.vec every iteration.
	;   %out    - global (addrspace 1) pointer that receives the final %or value.
	define amdgpu_kernel void @lsr_vector_iv_cost(<2 x i32> %arg0, i32 %stride, ptr addrspace(1) %out) {
	entry:
	; Start value of the %iv.vec recurrence.
	%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
	br label %loop

	loop:
	; Starts at 0 and is replaced by %or on each back edge; only used by %shr.
	%iv.pn = phi i32 [ 0, %entry ], [ %or, %loop ]
	; Induction variable: starts at %tid and advances by %stride per iteration.
	%iv.vec = phi i32 [ %tid, %entry ], [ %sum1, %loop ]
	%sum1 = add i32 %iv.vec, %stride
	; Loop-invariant: the operand is a function argument.
	%elt = extractelement <2 x i32> %arg0, i64 0
	; Arithmetic chain derived from the updated IV, ending in %or.
	%sum2 = add i32 %sum1, %elt
	%xor = xor i32 1, %sum2
	%sum3 = add i32 %sum2, %xor
	%sum4 = add i32 %sum3, %elt
	%or = or i32 %sum4, %stride
	; %shr has no users in this function.
	%shr = lshr i32 %iv.pn, 1
	; Keep looping while the updated IV is below 1024 (unsigned compare).
	%cmp = icmp ult i32 %sum1, 1024
	br i1 %cmp, label %loop, label %exit

	exit:
	; Publish the value of %or from the final iteration.
	store i32 %or, ptr addrspace(1) %out
	ret void
	}

	; NOTE(review): attribute group #0 is already defined earlier in this file
	; with the same contents - likely a copy/extraction artifact; confirm.
	; Group referenced as #0 by the @llvm.amdgcn.workitem.id.x declaration.
	attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }