| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 6 |
| ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -passes=loop-unroll -unroll-runtime -unroll-allow-partial -S < %s | FileCheck %s |
| |
| ; AMDGPU barrier intrinsic. Attribute group #0 (nounwind convergent) is attached here |
| ; and again at every call site in the kernels below. |
| declare void @llvm.amdgcn.s.barrier() #0 |
| |
| ; This loop has a convergent barrier and a runtime trip count that depends on |
| ; a uniform value (a kernel argument, which is passed in an SGPR). Since the trip |
| ; count is uniform across all threads, runtime unrolling with a remainder is |
| ; safe and should be performed. |
| ; |
| ; Expected output: the loop body is unrolled 8 times (values suffixed .1 through .7) |
| ; and an epilogue loop (for.body.epil) runs the remaining %n & 7 iterations. The entry |
| ; block branches straight to the epilogue when %n - 1 is below 7 (unsigned compare). |
| define amdgpu_kernel void @runtime_unroll_uniform_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in, i32 %n) #1 { |
| ; CHECK-LABEL: define amdgpu_kernel void @runtime_unroll_uniform_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in, i32 %n) #1 { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: %0 = add i32 %n, -1 |
| ; CHECK-NEXT: %xtraiter = and i32 %n, 7 |
| ; CHECK-NEXT: %1 = icmp ult i32 %0, 7 |
| ; CHECK-NEXT: br i1 %1, label %for.body.epil.preheader, label %entry.new |
| ; CHECK: entry.new: |
| ; CHECK-NEXT: %unroll_iter = sub i32 %n, %xtraiter |
| ; CHECK-NEXT: br label %for.body |
| ; CHECK: for.body: |
| ; CHECK-NEXT: %indvars.iv = phi i32 [ 0, %entry.new ], [ %indvars.iv.next.7, %for.body ] |
| ; CHECK-NEXT: %sum.02 = phi i32 [ 0, %entry.new ], [ %add.7, %for.body ] |
| ; CHECK-NEXT: %niter = phi i32 [ 0, %entry.new ], [ %niter.next.7, %for.body ] |
| ; CHECK-NEXT: %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv |
| ; CHECK-NEXT: %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv |
| ; CHECK-NEXT: %load = load i32, ptr addrspace(1) %arrayidx.in, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add = add i32 %load, %sum.02 |
| ; CHECK-NEXT: store i32 %add, ptr addrspace(1) %arrayidx.out, align 4 |
| ; CHECK-NEXT: %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 |
| ; CHECK-NEXT: %arrayidx.in.1 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next |
| ; CHECK-NEXT: %arrayidx.out.1 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next |
| ; CHECK-NEXT: %load.1 = load i32, ptr addrspace(1) %arrayidx.in.1, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.1 = add i32 %load.1, %add |
| ; CHECK-NEXT: store i32 %add.1, ptr addrspace(1) %arrayidx.out.1, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.1 = add nuw nsw i32 %indvars.iv, 2 |
| ; CHECK-NEXT: %arrayidx.in.2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.1 |
| ; CHECK-NEXT: %arrayidx.out.2 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.1 |
| ; CHECK-NEXT: %load.2 = load i32, ptr addrspace(1) %arrayidx.in.2, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.2 = add i32 %load.2, %add.1 |
| ; CHECK-NEXT: store i32 %add.2, ptr addrspace(1) %arrayidx.out.2, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.2 = add nuw nsw i32 %indvars.iv, 3 |
| ; CHECK-NEXT: %arrayidx.in.3 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.2 |
| ; CHECK-NEXT: %arrayidx.out.3 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.2 |
| ; CHECK-NEXT: %load.3 = load i32, ptr addrspace(1) %arrayidx.in.3, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.3 = add i32 %load.3, %add.2 |
| ; CHECK-NEXT: store i32 %add.3, ptr addrspace(1) %arrayidx.out.3, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.3 = add nuw nsw i32 %indvars.iv, 4 |
| ; CHECK-NEXT: %arrayidx.in.4 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.3 |
| ; CHECK-NEXT: %arrayidx.out.4 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.3 |
| ; CHECK-NEXT: %load.4 = load i32, ptr addrspace(1) %arrayidx.in.4, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.4 = add i32 %load.4, %add.3 |
| ; CHECK-NEXT: store i32 %add.4, ptr addrspace(1) %arrayidx.out.4, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.4 = add nuw nsw i32 %indvars.iv, 5 |
| ; CHECK-NEXT: %arrayidx.in.5 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.4 |
| ; CHECK-NEXT: %arrayidx.out.5 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.4 |
| ; CHECK-NEXT: %load.5 = load i32, ptr addrspace(1) %arrayidx.in.5, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.5 = add i32 %load.5, %add.4 |
| ; CHECK-NEXT: store i32 %add.5, ptr addrspace(1) %arrayidx.out.5, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.5 = add nuw nsw i32 %indvars.iv, 6 |
| ; CHECK-NEXT: %arrayidx.in.6 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.5 |
| ; CHECK-NEXT: %arrayidx.out.6 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.5 |
| ; CHECK-NEXT: %load.6 = load i32, ptr addrspace(1) %arrayidx.in.6, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.6 = add i32 %load.6, %add.5 |
| ; CHECK-NEXT: store i32 %add.6, ptr addrspace(1) %arrayidx.out.6, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.6 = add nuw nsw i32 %indvars.iv, 7 |
| ; CHECK-NEXT: %arrayidx.in.7 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.next.6 |
| ; CHECK-NEXT: %arrayidx.out.7 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.next.6 |
| ; CHECK-NEXT: %load.7 = load i32, ptr addrspace(1) %arrayidx.in.7, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.7 = add i32 %load.7, %add.6 |
| ; CHECK-NEXT: store i32 %add.7, ptr addrspace(1) %arrayidx.out.7, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.7 = add i32 %indvars.iv, 8 |
| ; CHECK-NEXT: %niter.next.7 = add i32 %niter, 8 |
| ; CHECK-NEXT: %niter.ncmp.7 = icmp eq i32 %niter.next.7, %unroll_iter |
| ; CHECK-NEXT: br i1 %niter.ncmp.7, label %for.end.unr-lcssa, label %for.body |
| ; CHECK: for.end.unr-lcssa: |
| ; CHECK-NEXT: %indvars.iv.unr = phi i32 [ %indvars.iv.next.7, %for.body ] |
| ; CHECK-NEXT: %sum.02.unr = phi i32 [ %add.7, %for.body ] |
| ; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0 |
| ; CHECK-NEXT: br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end |
| ; CHECK: for.body.epil.preheader: |
| ; CHECK-NEXT: %indvars.iv.epil.init = phi i32 [ 0, %entry ], [ %indvars.iv.unr, %for.end.unr-lcssa ] |
| ; CHECK-NEXT: %sum.02.epil.init = phi i32 [ 0, %entry ], [ %sum.02.unr, %for.end.unr-lcssa ] |
| ; CHECK-NEXT: %lcmp.mod1 = icmp ne i32 %xtraiter, 0 |
| ; CHECK-NEXT: call void @llvm.assume(i1 %lcmp.mod1) |
| ; CHECK-NEXT: br label %for.body.epil |
| ; CHECK: for.body.epil: |
| ; CHECK-NEXT: %indvars.iv.epil = phi i32 [ %indvars.iv.next.epil, %for.body.epil ], [ %indvars.iv.epil.init, %for.body.epil.preheader ] |
| ; CHECK-NEXT: %sum.02.epil = phi i32 [ %add.epil, %for.body.epil ], [ %sum.02.epil.init, %for.body.epil.preheader ] |
| ; CHECK-NEXT: %epil.iter = phi i32 [ 0, %for.body.epil.preheader ], [ %epil.iter.next, %for.body.epil ] |
| ; CHECK-NEXT: %arrayidx.in.epil = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv.epil |
| ; CHECK-NEXT: %arrayidx.out.epil = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv.epil |
| ; CHECK-NEXT: %load.epil = load i32, ptr addrspace(1) %arrayidx.in.epil, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add.epil = add i32 %load.epil, %sum.02.epil |
| ; CHECK-NEXT: store i32 %add.epil, ptr addrspace(1) %arrayidx.out.epil, align 4 |
| ; CHECK-NEXT: %indvars.iv.next.epil = add i32 %indvars.iv.epil, 1 |
| ; CHECK-NEXT: %exitcond.epil = icmp eq i32 %indvars.iv.next.epil, %n |
| ; CHECK-NEXT: %epil.iter.next = add i32 %epil.iter, 1 |
| ; CHECK-NEXT: %epil.iter.cmp = icmp ne i32 %epil.iter.next, %xtraiter |
| ; CHECK-NEXT: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.epilog-lcssa, !llvm.loop !0 |
| ; CHECK: for.end.epilog-lcssa: |
| ; CHECK-NEXT: br label %for.end |
| ; CHECK: for.end: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| br label %for.body |
| |
| ; Single-block loop: each iteration loads in[i], executes the barrier, adds the loaded |
| ; value to the running sum and stores that sum to out[i]. |
| for.body: |
| ; %indvars.iv is the iteration index i and %sum.02 the running sum; both start at 0. |
| %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] |
| %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] |
| %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv |
| %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv |
| %load = load i32, ptr addrspace(1) %arrayidx.in |
| ; Call-site attribute group #0 (nounwind convergent) marks this call as convergent. |
| call void @llvm.amdgcn.s.barrier() #0 |
| %add = add i32 %load, %sum.02 |
| store i32 %add, ptr addrspace(1) %arrayidx.out |
| ; The exit test compares i + 1 against the kernel argument %n. |
| %indvars.iv.next = add i32 %indvars.iv, 1 |
| %exitcond = icmp eq i32 %indvars.iv.next, %n |
| br i1 %exitcond, label %for.end, label %for.body |
| |
| for.end: |
| ret void |
| } |
| |
| ; This loop has a convergent barrier and a divergent trip count (derived from |
| ; llvm.amdgcn.workitem.id.x, which is divergent). Since the trip count is |
| ; divergent, runtime unrolling with a remainder must still be blocked. |
| ; |
| ; Expected output: the function is left as written, with a single for.body block, |
| ; one barrier call, and no unrolled copies or epilogue blocks. |
| define amdgpu_kernel void @runtime_unroll_divergent_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in) #1 { |
| ; CHECK-LABEL: define amdgpu_kernel void @runtime_unroll_divergent_trip_count(ptr addrspace(1) noalias captures(none) %out, ptr addrspace(1) noalias captures(none) %in) #1 { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| ; CHECK-NEXT: %n = add i32 %tid, 1 |
| ; CHECK-NEXT: br label %for.body |
| ; CHECK: for.body: |
| ; CHECK-NEXT: %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] |
| ; CHECK-NEXT: %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] |
| ; CHECK-NEXT: %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv |
| ; CHECK-NEXT: %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv |
| ; CHECK-NEXT: %load = load i32, ptr addrspace(1) %arrayidx.in, align 4 |
| ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() #4 |
| ; CHECK-NEXT: %add = add i32 %load, %sum.02 |
| ; CHECK-NEXT: store i32 %add, ptr addrspace(1) %arrayidx.out, align 4 |
| ; CHECK-NEXT: %indvars.iv.next = add i32 %indvars.iv, 1 |
| ; CHECK-NEXT: %exitcond = icmp eq i32 %indvars.iv.next, %n |
| ; CHECK-NEXT: br i1 %exitcond, label %for.end, label %for.body |
| ; CHECK: for.end: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| ; The loop bound %n is built from the work-item id rather than from a kernel argument. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %n = add i32 %tid, 1 |
| br label %for.body |
| |
| ; Single-block loop: each iteration loads in[i], executes the barrier, adds the loaded |
| ; value to the running sum and stores that sum to out[i]. |
| for.body: |
| ; %indvars.iv is the iteration index i and %sum.02 the running sum; both start at 0. |
| %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] |
| %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] |
| %arrayidx.in = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %indvars.iv |
| %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %indvars.iv |
| %load = load i32, ptr addrspace(1) %arrayidx.in |
| ; Call-site attribute group #0 (nounwind convergent) marks this call as convergent. |
| call void @llvm.amdgcn.s.barrier() #0 |
| %add = add i32 %load, %sum.02 |
| store i32 %add, ptr addrspace(1) %arrayidx.out |
| ; The exit test compares i + 1 against %n = %tid + 1. |
| %indvars.iv.next = add i32 %indvars.iv, 1 |
| %exitcond = icmp eq i32 %indvars.iv.next, %n |
| br i1 %exitcond, label %for.end, label %for.body |
| |
| for.end: |
| ret void |
| } |
| |
| ; Work-item id intrinsic; @runtime_unroll_divergent_trip_count uses its result to build |
| ; the loop bound. |
| declare i32 @llvm.amdgcn.workitem.id.x() #2 |
| |
| ; #0: barrier intrinsic declaration and its call sites; `convergent` marks the barrier |
| ;     as a convergent operation. |
| attributes #0 = { nounwind convergent } |
| ; #1: both kernel definitions. |
| attributes #1 = { nounwind } |
| ; #2: work-item id intrinsic declaration. memory(none) is the current spelling of the |
| ;     legacy `readnone` attribute. |
| attributes #2 = { nounwind willreturn memory(none) } |