| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| ; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s | FileCheck %s --check-prefix=HOIST |
| ; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefix=HOIST |
| ; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefix=ROTATE |
| ; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefix=ROTATE |
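; With -rotation-max-header-size=0 or =1 the f0() call ends up hoisted into the
; loop header (HOIST prefix); with =2 or =3 the loop gets rotated instead
; (ROTATE prefix).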
| |
| ; This example is produced from a very basic C code: |
| ; |
| ; void f0(); |
| ; void f1(); |
| ; void f2(); |
| ; |
| ; void loop(int width) { |
| ; if(width < 1) |
| ; return; |
| ; for(int i = 0; i < width - 1; ++i) { |
| ; f0(); |
| ; f1(); |
| ; } |
| ; f0(); |
| ; f2(); |
| ; } |
| |
; We have a choice here. We can either:
; * hoist the f0() call into the loop header, in which case either
;   * loop rotation becomes unprofitable, since the loop header may have grown
;     above a certain threshold, and such unrotated loops are ignored by
;     LoopVectorizer, preventing vectorization,
;   * or loop rotation still succeeds, but produces some weird PHIs that also
;     harm vectorization;
; * or not hoist the f0() call before performing loop rotation (see the
;   C-level sketch of the rotated form below), at the cost of potential code
;   bloat, but with the loops potentially rotated and vectorized successfully,
;   at the cost of extra compile time.
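;
; For reference, a rough C-level sketch of what the rotated loop looks like
; (illustrative only; the name loop_rotated is made up, and the pass of course
; works on IR, not C): the for-loop becomes a guarded do-while, with the
; trip-count check duplicated in front of the loop:
;
; void loop_rotated(int width) {
;   if(width < 1)
;     return;
;   if(0 < width - 1) {
;     int i = 0;
;     do {
;       f0();
;       f1();
;       ++i;
;     } while(i < width - 1);
;   }
;   f0();
;   f2();
; }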
| |
| target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" |
| |
| declare void @f0() |
| declare void @f1() |
| declare void @f2() |
| |
| declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) |
| declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) |
| |
| define void @_Z4loopi(i32 %width) { |
| ; HOIST-LABEL: @_Z4loopi( |
| ; HOIST-NEXT: entry: |
| ; HOIST-NEXT: [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1 |
| ; HOIST-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] |
| ; HOIST: for.cond.preheader: |
| ; HOIST-NEXT: [[SUB:%.*]] = add nsw i32 [[WIDTH]], -1 |
| ; HOIST-NEXT: br label [[FOR_COND:%.*]] |
| ; HOIST: for.cond: |
| ; HOIST-NEXT: [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ] |
| ; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[SUB]] |
| ; HOIST-NEXT: tail call void @f0() |
| ; HOIST-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] |
| ; HOIST: for.cond.cleanup: |
| ; HOIST-NEXT: tail call void @f2() |
| ; HOIST-NEXT: br label [[RETURN]] |
| ; HOIST: for.body: |
| ; HOIST-NEXT: tail call void @f1() |
| ; HOIST-NEXT: [[INC]] = add nuw i32 [[I_0]], 1 |
| ; HOIST-NEXT: br label [[FOR_COND]] |
| ; HOIST: return: |
| ; HOIST-NEXT: ret void |
| ; |
| ; ROTATE-LABEL: @_Z4loopi( |
| ; ROTATE-NEXT: entry: |
| ; ROTATE-NEXT: [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1 |
| ; ROTATE-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] |
| ; ROTATE: for.cond.preheader: |
| ; ROTATE-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1 |
| ; ROTATE-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] |
| ; ROTATE: for.body.preheader: |
| ; ROTATE-NEXT: [[TMP0:%.*]] = add i32 [[WIDTH]], -2 |
| ; ROTATE-NEXT: br label [[FOR_BODY:%.*]] |
| ; ROTATE: for.cond.cleanup: |
| ; ROTATE-NEXT: tail call void @f0() |
| ; ROTATE-NEXT: tail call void @f2() |
| ; ROTATE-NEXT: br label [[RETURN]] |
| ; ROTATE: for.body: |
| ; ROTATE-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] |
| ; ROTATE-NEXT: tail call void @f0() |
| ; ROTATE-NEXT: tail call void @f1() |
| ; ROTATE-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1 |
| ; ROTATE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_04]], [[TMP0]] |
| ; ROTATE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] |
| ; ROTATE: return: |
| ; ROTATE-NEXT: ret void |
| ; |
| entry: |
| %width.addr = alloca i32, align 4 |
| %i = alloca i32, align 4 |
| store i32 %width, ptr %width.addr, align 4 |
| %i1 = load i32, ptr %width.addr, align 4 |
| %cmp = icmp slt i32 %i1, 1 |
| br i1 %cmp, label %if.then, label %if.end |
| |
| if.then: |
| br label %return |
| |
| if.end: |
| call void @llvm.lifetime.start.p0(i64 4, ptr %i) |
| store i32 0, ptr %i, align 4 |
| br label %for.cond |
| |
| for.cond: |
| %i3 = load i32, ptr %i, align 4 |
| %i4 = load i32, ptr %width.addr, align 4 |
| %sub = sub nsw i32 %i4, 1 |
| %cmp1 = icmp slt i32 %i3, %sub |
| br i1 %cmp1, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: |
| call void @llvm.lifetime.end.p0(i64 4, ptr %i) |
| br label %for.end |
| |
| for.body: |
| call void @f0() |
| call void @f1() |
| br label %for.inc |
| |
| for.inc: |
| %i6 = load i32, ptr %i, align 4 |
| %inc = add nsw i32 %i6, 1 |
| store i32 %inc, ptr %i, align 4 |
| br label %for.cond |
| |
| for.end: |
| call void @f0() |
| call void @f2() |
| br label %return |
| |
| return: |
| ret void |
| } |