| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s |
| |
| ; This test has an instruction that gets sunk into the loop and that is an |
| ; active.lane.mask operand (%exitcount.ptrcnt.to.int = ptrtoint). We need |
| ; to make sure it is loop invariant. |
| |
| define i32 @a(i32* readnone %b, i8* %c) { |
| ; CHECK-LABEL: a: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, lr} |
| ; CHECK-NEXT: push {r4, lr} |
| ; CHECK-NEXT: cmp r0, r1 |
| ; CHECK-NEXT: it ls |
| ; CHECK-NEXT: popls {r4, pc} |
| ; CHECK-NEXT: .LBB0_1: @ %while.body.preheader |
| ; CHECK-NEXT: subs r4, r0, r1 |
| ; CHECK-NEXT: movs r2, #0 |
| ; CHECK-NEXT: mov r3, r1 |
| ; CHECK-NEXT: dlstp.8 lr, r4 |
| ; CHECK-NEXT: .LBB0_2: @ %vector.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: adds r0, r1, r2 |
| ; CHECK-NEXT: vidup.u8 q0, r0, #1 |
| ; CHECK-NEXT: adds r2, #16 |
| ; CHECK-NEXT: vstrb.8 q0, [r3], #16 |
| ; CHECK-NEXT: letp lr, .LBB0_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r4, pc} |
| entry: |
| %0 = bitcast i32* %b to i8* |
| %cmp3 = icmp ugt i8* %0, %c |
| br i1 %cmp3, label %while.body.preheader, label %while.end |
| |
| while.body.preheader: ; preds = %entry. Computes the byte distance %b - %c (%exitcount.ptrcnt.to.int) and rounds it up to a multiple of 16 (%n.vec). |
| %c5 = ptrtoint i8* %c to i32 |
| %1 = sub i32 0, %c5 |
| %uglygep = getelementptr i8, i8* %0, i32 %1 |
| %exitcount.ptrcnt.to.int = ptrtoint i8* %uglygep to i32 |
| %n.rnd.up = add i32 %exitcount.ptrcnt.to.int, 15 |
| %n.vec = and i32 %n.rnd.up, -16 |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %while.body.preheader. Each iteration builds the 16 addresses %c + (%index or k), k = 0..15, truncates them to i8 and masked-stores them at %c + %index; lanes are enabled by get.active.lane.mask(%index, %exitcount.ptrcnt.to.int) and %index advances by 16 until it equals %n.vec. |
| %index = phi i32 [ 0, %while.body.preheader ], [ %index.next, %vector.body ] |
| %next.gep = getelementptr i8, i8* %c, i32 %index |
| %2 = or i32 %index, 1 |
| %next.gep7 = getelementptr i8, i8* %c, i32 %2 |
| %3 = or i32 %index, 2 |
| %next.gep8 = getelementptr i8, i8* %c, i32 %3 |
| %4 = or i32 %index, 3 |
| %next.gep9 = getelementptr i8, i8* %c, i32 %4 |
| %5 = or i32 %index, 4 |
| %next.gep10 = getelementptr i8, i8* %c, i32 %5 |
| %6 = or i32 %index, 5 |
| %next.gep11 = getelementptr i8, i8* %c, i32 %6 |
| %7 = or i32 %index, 6 |
| %next.gep12 = getelementptr i8, i8* %c, i32 %7 |
| %8 = or i32 %index, 7 |
| %next.gep13 = getelementptr i8, i8* %c, i32 %8 |
| %9 = or i32 %index, 8 |
| %next.gep14 = getelementptr i8, i8* %c, i32 %9 |
| %10 = or i32 %index, 9 |
| %next.gep15 = getelementptr i8, i8* %c, i32 %10 |
| %11 = or i32 %index, 10 |
| %next.gep16 = getelementptr i8, i8* %c, i32 %11 |
| %12 = or i32 %index, 11 |
| %next.gep17 = getelementptr i8, i8* %c, i32 %12 |
| %13 = or i32 %index, 12 |
| %next.gep18 = getelementptr i8, i8* %c, i32 %13 |
| %14 = or i32 %index, 13 |
| %next.gep19 = getelementptr i8, i8* %c, i32 %14 |
| %15 = or i32 %index, 14 |
| %next.gep20 = getelementptr i8, i8* %c, i32 %15 |
| %16 = or i32 %index, 15 |
| %next.gep21 = getelementptr i8, i8* %c, i32 %16 |
| %17 = insertelement <16 x i8*> poison, i8* %next.gep, i32 0 |
| %18 = insertelement <16 x i8*> %17, i8* %next.gep7, i32 1 |
| %19 = insertelement <16 x i8*> %18, i8* %next.gep8, i32 2 |
| %20 = insertelement <16 x i8*> %19, i8* %next.gep9, i32 3 |
| %21 = insertelement <16 x i8*> %20, i8* %next.gep10, i32 4 |
| %22 = insertelement <16 x i8*> %21, i8* %next.gep11, i32 5 |
| %23 = insertelement <16 x i8*> %22, i8* %next.gep12, i32 6 |
| %24 = insertelement <16 x i8*> %23, i8* %next.gep13, i32 7 |
| %25 = insertelement <16 x i8*> %24, i8* %next.gep14, i32 8 |
| %26 = insertelement <16 x i8*> %25, i8* %next.gep15, i32 9 |
| %27 = insertelement <16 x i8*> %26, i8* %next.gep16, i32 10 |
| %28 = insertelement <16 x i8*> %27, i8* %next.gep17, i32 11 |
| %29 = insertelement <16 x i8*> %28, i8* %next.gep18, i32 12 |
| %30 = insertelement <16 x i8*> %29, i8* %next.gep19, i32 13 |
| %31 = insertelement <16 x i8*> %30, i8* %next.gep20, i32 14 |
| %32 = insertelement <16 x i8*> %31, i8* %next.gep21, i32 15 |
| %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %exitcount.ptrcnt.to.int) |
| %33 = ptrtoint <16 x i8*> %32 to <16 x i32> |
| %34 = trunc <16 x i32> %33 to <16 x i8> |
| %35 = bitcast i8* %next.gep to <16 x i8>* |
| call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %34, <16 x i8>* %35, i32 1, <16 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 16 |
| %36 = icmp eq i32 %index.next, %n.vec |
| br i1 %36, label %while.end, label %vector.body |
| |
| while.end: ; preds = %vector.body, %entry. Reached when %b is not above %c (unsigned compare) or when the loop finishes; returns an undef i32. |
| ret i32 undef |
| } |
| |
| declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) |
| declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) |