; blob: acd6d285c463c7df5b51cae32c1f45fcd853c5f9 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 6
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -passes=loop-unroll -unroll-runtime -unroll-allow-partial -S < %s | FileCheck %s
; Barrier intrinsic called once per iteration in both loops below. It carries
; attribute group #0, which marks it convergent.
declare void @llvm.amdgcn.s.barrier() #0
; Positive case: convergent barrier inside a loop whose runtime trip count is
; uniform.
;
; The trip count %n is a kernel argument (passed in an SGPR), so every thread
; executes the same number of iterations. Runtime unrolling with a remainder
; loop is therefore safe and is expected to happen: the assertions below
; describe a main loop unrolled by 8 followed by an epilogue loop that handles
; the remaining (%n & 7) iterations.
;
define amdgpu_kernel void @runtime_unroll_uniform_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in, i32 %n) #1 {
; CHECK-LABEL: define amdgpu_kernel void @runtime_unroll_uniform_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in, i32 %n) #1 {
; CHECK-NEXT: entry:
; CHECK-NEXT: %0 = add i32 %n, -1
; CHECK-NEXT: %xtraiter = and i32 %n, 7
; CHECK-NEXT: %1 = icmp ult i32 %0, 7
; CHECK-NEXT: br i1 %1, label %for.body.epil.preheader, label %entry.new
; CHECK: entry.new:
; CHECK-NEXT: %unroll_iter = sub i32 %n, %xtraiter
; CHECK-NEXT: br label %for.body
; CHECK: for.body:
; CHECK-NEXT: %indvars.iv = phi i32 [ 0, %entry.new ], [ %indvars.iv.next.7, %for.body ]
; CHECK-NEXT: %sum.02 = phi i32 [ 0, %entry.new ], [ %add.7, %for.body ]
; CHECK-NEXT: %niter = phi i32 [ 0, %entry.new ], [ %niter.next.7, %for.body ]
; CHECK-NEXT: %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv
; CHECK-NEXT: %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv
; CHECK-NEXT: %load = load i32, ptr addrspace(1) %arrayidx.in, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add = add i32 %load, %sum.02
; CHECK-NEXT: store i32 %add, ptr addrspace(1) %arrayidx.out, align 4
; CHECK-NEXT: %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
; CHECK-NEXT: %arrayidx.in.1 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next
; CHECK-NEXT: %arrayidx.out.1 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next
; CHECK-NEXT: %load.1 = load i32, ptr addrspace(1) %arrayidx.in.1, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.1 = add i32 %load.1, %add
; CHECK-NEXT: store i32 %add.1, ptr addrspace(1) %arrayidx.out.1, align 4
; CHECK-NEXT: %indvars.iv.next.1 = add nuw nsw i32 %indvars.iv, 2
; CHECK-NEXT: %arrayidx.in.2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.1
; CHECK-NEXT: %arrayidx.out.2 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.1
; CHECK-NEXT: %load.2 = load i32, ptr addrspace(1) %arrayidx.in.2, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.2 = add i32 %load.2, %add.1
; CHECK-NEXT: store i32 %add.2, ptr addrspace(1) %arrayidx.out.2, align 4
; CHECK-NEXT: %indvars.iv.next.2 = add nuw nsw i32 %indvars.iv, 3
; CHECK-NEXT: %arrayidx.in.3 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.2
; CHECK-NEXT: %arrayidx.out.3 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.2
; CHECK-NEXT: %load.3 = load i32, ptr addrspace(1) %arrayidx.in.3, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.3 = add i32 %load.3, %add.2
; CHECK-NEXT: store i32 %add.3, ptr addrspace(1) %arrayidx.out.3, align 4
; CHECK-NEXT: %indvars.iv.next.3 = add nuw nsw i32 %indvars.iv, 4
; CHECK-NEXT: %arrayidx.in.4 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.3
; CHECK-NEXT: %arrayidx.out.4 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.3
; CHECK-NEXT: %load.4 = load i32, ptr addrspace(1) %arrayidx.in.4, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.4 = add i32 %load.4, %add.3
; CHECK-NEXT: store i32 %add.4, ptr addrspace(1) %arrayidx.out.4, align 4
; CHECK-NEXT: %indvars.iv.next.4 = add nuw nsw i32 %indvars.iv, 5
; CHECK-NEXT: %arrayidx.in.5 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.4
; CHECK-NEXT: %arrayidx.out.5 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.4
; CHECK-NEXT: %load.5 = load i32, ptr addrspace(1) %arrayidx.in.5, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.5 = add i32 %load.5, %add.4
; CHECK-NEXT: store i32 %add.5, ptr addrspace(1) %arrayidx.out.5, align 4
; CHECK-NEXT: %indvars.iv.next.5 = add nuw nsw i32 %indvars.iv, 6
; CHECK-NEXT: %arrayidx.in.6 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.5
; CHECK-NEXT: %arrayidx.out.6 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.5
; CHECK-NEXT: %load.6 = load i32, ptr addrspace(1) %arrayidx.in.6, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.6 = add i32 %load.6, %add.5
; CHECK-NEXT: store i32 %add.6, ptr addrspace(1) %arrayidx.out.6, align 4
; CHECK-NEXT: %indvars.iv.next.6 = add nuw nsw i32 %indvars.iv, 7
; CHECK-NEXT: %arrayidx.in.7 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.6
; CHECK-NEXT: %arrayidx.out.7 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.6
; CHECK-NEXT: %load.7 = load i32, ptr addrspace(1) %arrayidx.in.7, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.7 = add i32 %load.7, %add.6
; CHECK-NEXT: store i32 %add.7, ptr addrspace(1) %arrayidx.out.7, align 4
; CHECK-NEXT: %indvars.iv.next.7 = add i32 %indvars.iv, 8
; CHECK-NEXT: %niter.next.7 = add i32 %niter, 8
; CHECK-NEXT: %niter.ncmp.7 = icmp eq i32 %niter.next.7, %unroll_iter
; CHECK-NEXT: br i1 %niter.ncmp.7, label %for.end.unr-lcssa, label %for.body
; CHECK: for.end.unr-lcssa:
; CHECK-NEXT: %indvars.iv.unr = phi i32 [ %indvars.iv.next.7, %for.body ]
; CHECK-NEXT: %sum.02.unr = phi i32 [ %add.7, %for.body ]
; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0
; CHECK-NEXT: br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end
; CHECK: for.body.epil.preheader:
; CHECK-NEXT: %indvars.iv.epil.init = phi i32 [ 0, %entry ], [ %indvars.iv.unr, %for.end.unr-lcssa ]
; CHECK-NEXT: %sum.02.epil.init = phi i32 [ 0, %entry ], [ %sum.02.unr, %for.end.unr-lcssa ]
; CHECK-NEXT: %lcmp.mod1 = icmp ne i32 %xtraiter, 0
; CHECK-NEXT: call void @llvm.assume(i1 %lcmp.mod1)
; CHECK-NEXT: br label %for.body.epil
; CHECK: for.body.epil:
; CHECK-NEXT: %indvars.iv.epil = phi i32 [ %indvars.iv.next.epil, %for.body.epil ], [ %indvars.iv.epil.init, %for.body.epil.preheader ]
; CHECK-NEXT: %sum.02.epil = phi i32 [ %add.epil, %for.body.epil ], [ %sum.02.epil.init, %for.body.epil.preheader ]
; CHECK-NEXT: %epil.iter = phi i32 [ 0, %for.body.epil.preheader ], [ %epil.iter.next, %for.body.epil ]
; CHECK-NEXT: %arrayidx.in.epil = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.epil
; CHECK-NEXT: %arrayidx.out.epil = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.epil
; CHECK-NEXT: %load.epil = load i32, ptr addrspace(1) %arrayidx.in.epil, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add.epil = add i32 %load.epil, %sum.02.epil
; CHECK-NEXT: store i32 %add.epil, ptr addrspace(1) %arrayidx.out.epil, align 4
; CHECK-NEXT: %indvars.iv.next.epil = add i32 %indvars.iv.epil, 1
; CHECK-NEXT: %exitcond.epil = icmp eq i32 %indvars.iv.next.epil, %n
; CHECK-NEXT: %epil.iter.next = add i32 %epil.iter, 1
; CHECK-NEXT: %epil.iter.cmp = icmp ne i32 %epil.iter.next, %xtraiter
; CHECK-NEXT: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.epilog-lcssa, !llvm.loop !0
; CHECK: for.end.epilog-lcssa:
; CHECK-NEXT: ret void
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

; Running-sum loop: each iteration loads in[i], adds it to the accumulated sum
; and stores the new sum to out[i]. The barrier sits between the load and the
; store, so every iteration contains a convergent call.
for.body:
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv
  %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv
  %load = load i32, ptr addrspace(1) %arrayidx.in
  call void @llvm.amdgcn.s.barrier() #0
  %add = add i32 %load, %sum.02
  store i32 %add, ptr addrspace(1) %arrayidx.out
  %indvars.iv.next = add i32 %indvars.iv, 1
  ; Bottom-tested exit: leave once the next index equals the kernel argument %n.
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
; Negative case: convergent barrier inside a loop whose trip count is
; divergent.
;
; The trip count is derived from llvm.amdgcn.workitem.id.x, which is divergent,
; so different threads may run a different number of iterations. Runtime
; unrolling with a remainder must still be blocked here: the assertions below
; expect the loop to come out of the pass unchanged.
;
define amdgpu_kernel void @runtime_unroll_divergent_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in) #1 {
; CHECK-LABEL: define amdgpu_kernel void @runtime_unroll_divergent_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in) #1 {
; CHECK-NEXT: entry:
; CHECK-NEXT: %tid = call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: %n = add i32 %tid, 1
; CHECK-NEXT: br label %for.body
; CHECK: for.body:
; CHECK-NEXT: %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
; CHECK-NEXT: %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
; CHECK-NEXT: %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv
; CHECK-NEXT: %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv
; CHECK-NEXT: %load = load i32, ptr addrspace(1) %arrayidx.in, align 4
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4
; CHECK-NEXT: %add = add i32 %load, %sum.02
; CHECK-NEXT: store i32 %add, ptr addrspace(1) %arrayidx.out, align 4
; CHECK-NEXT: %indvars.iv.next = add i32 %indvars.iv, 1
; CHECK-NEXT: %exitcond = icmp eq i32 %indvars.iv.next, %n
; CHECK-NEXT: br i1 %exitcond, label %for.end, label %for.body
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  ; Per-thread trip count: workitem id + 1.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %n = add i32 %tid, 1
  br label %for.body

; Same running-sum body as the uniform test: load in[i], accumulate, store the
; sum to out[i], with a convergent barrier between the load and the store.
for.body:
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv
  %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv
  %load = load i32, ptr addrspace(1) %arrayidx.in
  call void @llvm.amdgcn.s.barrier() #0
  %add = add i32 %load, %sum.02
  store i32 %add, ptr addrspace(1) %arrayidx.out
  %indvars.iv.next = add i32 %indvars.iv, 1
  ; Bottom-tested exit against the thread-dependent bound %n.
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
; Work-item id intrinsic; the divergent-trip-count test derives its loop bound
; from this value.
declare i32 @llvm.amdgcn.workitem.id.x() #2

; Attribute groups used by this file:
;   #0 - the barrier declaration and its call sites (convergent)
;   #1 - both kernels
;   #2 - the work-item id declaration; memory(none) is the current spelling of
;        the legacy readnone attribute
attributes #0 = { nounwind convergent }
attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) willreturn }