| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s |
| |
| ; vaddq: loop that adds a splat of 10 (plain IR `add`) to a masked load from %x |
| ; and writes the sum to %y with a masked store under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vadd.i32 operand. |
| define void @vaddq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vaddq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB0_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB0_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vadd.i32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB0_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| %3 = add <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10> |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vadd: in-place loop over %s1 that adds a splat of the scalar argument %c0 using |
| ; the predicated MVE add intrinsic, with loads and stores masked by vctp32. |
| ; The assertions below expect a dlstp.32/letp loop where %c0 (r1) is used directly |
| ; as the scalar operand of vadd.i32, with no vector splat materialised. |
| define void @vadd(i32* %s1, i32 %c0, i32 %N) { |
| ; CHECK-LABEL: vadd: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB1_1: @ %while.body.lr.ph |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB1_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vadd.i32 q0, q0, r1 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB1_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| ; Broadcast %c0 to all four lanes once, outside the loop. |
| %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 |
| %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the add and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; NOTE(review): the trailing %2 operand is presumably the value kept in inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vsubq: loop that subtracts a splat of 10 (plain IR `sub`) from a masked load |
| ; from %x and writes the result to %y with a masked store under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vsub.i32 operand. |
| define void @vsubq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vsubq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB2_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB2_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vsub.i32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB2_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| %3 = sub <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10> |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vsub: in-place loop over %s1 that subtracts a constant splat of 10 using the |
| ; predicated MVE sub intrinsic, with loads and stores masked by vctp32. |
| ; The assertions below expect a dlstp.32/letp loop with the constant moved into |
| ; r2 before the loop and used as the scalar operand of vsub.i32. |
| define void @vsub(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vsub: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB3_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB3_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vsub.i32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB3_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the sub and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; NOTE(review): the trailing %2 operand is presumably the value kept in inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vmulq: loop that multiplies a masked load from %x by a splat of 10 (plain IR |
| ; `mul`) and writes the product to %y with a masked store under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vmul.i32 operand. |
| define void @vmulq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vmulq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB4_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB4_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vmul.i32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB4_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| %3 = mul <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10> |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vmul: in-place loop over %s1 that multiplies by a constant splat of 10 using |
| ; the predicated MVE mul intrinsic, with loads and stores masked by vctp32. |
| ; The assertions below expect a dlstp.32/letp loop with the constant moved into |
| ; r2 before the loop and used as the scalar operand of vmul.i32. |
| define void @vmul(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vmul: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB5_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB5_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vmul.i32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB5_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the mul and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; NOTE(review): the trailing %2 operand is presumably the value kept in inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vqaddq: loop that applies the generic signed saturating add (llvm.sadd.sat) of |
| ; a splat of 10 to a masked load from %x and stores to %y under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vqadd.s32 operand. |
| define void @vqaddq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vqaddq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB6_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB6_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vqadd.s32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB6_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| %3 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vqaddqu: loop that applies the generic unsigned saturating add (llvm.uadd.sat) |
| ; of a splat of 10 to a masked load from %x and stores to %y under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vqadd.u32 operand. |
| define void @vqaddqu(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vqaddqu: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB7_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB7_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vqadd.u32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB7_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| %3 = tail call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vqadd: in-place loop over %s1 that applies the predicated MVE qadd intrinsic |
| ; with a constant splat of 10, with loads and stores masked by vctp32. |
| ; The assertions below expect a dlstp.32/letp loop with the constant moved into |
| ; r2 before the loop and used as the scalar operand of vqadd.s32. |
| define void @vqadd(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vqadd: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB8_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB8_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vqadd.s32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB8_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the qadd and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; NOTE(review): the `i32 0` operand presumably selects the signed form (the expected output is vqadd.s32) and the |
| ; trailing %2 is presumably the inactive-lane value - confirm both against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vqsubq: loop that applies the generic signed saturating subtract (llvm.ssub.sat) |
| ; of a splat of 10 to a masked load from %x and stores to %y under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vqsub.s32 operand. |
| define void @vqsubq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vqsubq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB9_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB9_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vqsub.s32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB9_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| %3 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vqsubqu: loop that applies the generic unsigned saturating subtract (llvm.usub.sat) |
| ; of a splat of 10 to a masked load from %x and stores to %y under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vqsub.u32 operand. |
| define void @vqsubqu(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vqsubqu: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB10_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB10_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vqsub.u32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB10_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| %3 = tail call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vqsub: in-place loop over %s1 that applies the predicated MVE qsub intrinsic |
| ; with a constant splat of 10, with loads and stores masked by vctp32. |
| ; The assertions below expect a dlstp.32/letp loop with the constant moved into |
| ; r2 before the loop and used as the scalar operand of vqsub.s32. |
| define void @vqsub(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vqsub: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB11_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB11_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vqsub.s32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB11_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the qsub and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; NOTE(review): the `i32 0` operand presumably selects the signed form (the expected output is vqsub.s32) and the |
| ; trailing %2 is presumably the inactive-lane value - confirm both against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vhaddq: loop that applies the unpredicated MVE vhadd intrinsic with a splat of |
| ; 10 to a masked load from %x and stores to %y under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vhadd.s32 operand. |
| define void @vhaddq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vhaddq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB12_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB12_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vhadd.s32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB12_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| ; NOTE(review): the `i32 0` operand presumably selects the signed form (the expected output is vhadd.s32) - confirm. |
| %3 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vhadd: in-place loop over %s1 that applies the predicated MVE hadd intrinsic |
| ; with a constant splat of 10, with loads and stores masked by vctp32. |
| ; The assertions below expect a dlstp.32/letp loop with the constant moved into |
| ; r2 before the loop and used as the scalar operand of vhadd.s32. |
| define void @vhadd(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vhadd: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB13_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB13_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vhadd.s32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB13_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the hadd and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; NOTE(review): the `i32 0` operand presumably selects the signed form (the expected output is vhadd.s32) and the |
| ; trailing %2 is presumably the inactive-lane value - confirm both against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vhsubq: loop that applies the unpredicated MVE vhsub intrinsic with a splat of |
| ; 10 to a masked load from %x and stores to %y under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop with the constant held in a |
| ; scalar register and used directly as the vhsub.s32 operand. |
| define void @vhsubq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vhsubq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB14_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB14_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vhsub.s32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB14_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| ; NOTE(review): the `i32 0` operand presumably selects the signed form (the expected output is vhsub.s32) - confirm. |
| %3 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vhsub: in-place loop over %s1 that applies the predicated MVE hsub intrinsic |
| ; with a constant splat of 10, with loads and stores masked by vctp32. |
| ; The assertions below expect a dlstp.32/letp loop with the constant moved into |
| ; r2 before the loop and used as the scalar operand of vhsub.s32. |
| define void @vhsub(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vhsub: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB15_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB15_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vhsub.s32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB15_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the hsub and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; NOTE(review): the `i32 0` operand presumably selects the signed form (the expected output is vhsub.s32) and the |
| ; trailing %2 is presumably the inactive-lane value - confirm both against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vqdmullbq: loop that applies the unpredicated MVE vqdmull intrinsic (<4 x i32> |
| ; inputs, <2 x i64> result) with a splat of 10 to a masked load from %x, then |
| ; bitcasts the result to <4 x i32> and stores it to %y under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop using vqdmullb.s32 with a |
| ; scalar register operand and a destination register (q1) distinct from the source (q0). |
| define void @vqdmullbq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vqdmullbq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB16_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB16_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vqdmullb.s32 q1, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB16_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane mask derived from the remaining element count; reused by the load and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| ; NOTE(review): the `i32 0` operand presumably selects the bottom-half form (the expected output is vqdmullb) - confirm. |
| %3 = tail call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0) |
| ; Reinterpret the two 64-bit results as four 32-bit lanes so the <4 x i1> mask can be reused for the store. |
| %4 = bitcast <2 x i64> %3 to <4 x i32> |
| %5 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %4, <4 x i32>* %5, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| |
| ; vqdmull: in-place loop over %s1 that performs a masked load of <4 x i16>, |
| ; sign-extends it to <4 x i32>, reinterprets that as <8 x i16>, and applies the |
| ; predicated MVE vqdmull intrinsic with a constant i16 splat of 10. The <4 x i32> |
| ; result is stored back to %s1 under the same vctp32 mask. |
| ; The assertions below expect a dlstp.32/letp loop using a widening vldrh.s32 load |
| ; and vqdmullb.s16 with the constant in a scalar register. |
| define void @vqdmull(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vqdmull: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB17_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB17_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0] |
| ; CHECK-NEXT: vqdmullb.s16 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB17_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane mask derived from the remaining element count; reused by the load, the vqdmull and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| ; The same pointer is read as four i16 values here and written as four i32 values further down. |
| %1 = bitcast i32* %s1.addr.013 to <4 x i16>* |
| %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer) |
| %3 = sext <4 x i16> %2 to <4 x i32> |
| %4 = bitcast <4 x i32> %3 to <8 x i16> |
| ; NOTE(review): the `i32 0` operand presumably selects the bottom-half form (the expected output is vqdmullb.s16) and the |
| ; trailing %3 is presumably the inactive-lane value - confirm both against the intrinsic definition. |
| %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>, i32 0, <4 x i1> %0, <4 x i32> %3) |
| %6 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %6, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; the loop repeats while more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vqdmulhq: loop that masked-loads a <4 x i32> from %x, applies the |
| ; unpredicated vqdmulh intrinsic with a constant splat of 10, and masked-stores |
| ; the result to %y. The checks expect the 10 in r3 as the scalar operand of |
| ; vqdmulh.s32 inside a dlstp.32/letp loop. |
| define void @vqdmulhq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vqdmulhq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB18_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB18_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vqdmulh.s32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB18_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks the load and the store; the intrinsic call itself takes no mask. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| ; Operation under test: the second operand is a constant splat of 10. |
| %3 = tail call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vqdmulh: in-place loop over %s1 using the predicated qdmulh intrinsic with a |
| ; constant splat of 10. The same vctp32 mask drives the load, the intrinsic and |
| ; the store. The checks expect the 10 in r2 as the scalar operand of |
| ; vqdmulh.s32 inside a dlstp.32/letp loop. |
| define void @vqdmulh(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vqdmulh: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB19_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB19_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vqdmulh.s32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB19_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| ; %1 is used both as the load address and as the store address (in-place update). |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; Operation under test. The trailing %2 operand is presumably the value for |
| ; inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vqrdmulhq: loop that masked-loads a <4 x i32> from %x, applies the |
| ; unpredicated vqrdmulh intrinsic with a constant splat of 10, and masked-stores |
| ; the result to %y. The checks expect the 10 in r3 as the scalar operand of |
| ; vqrdmulh.s32 inside a dlstp.32/letp loop. |
| define void @vqrdmulhq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vqrdmulhq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB20_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB20_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vqrdmulh.s32 q0, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q0, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB20_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks the load and the store; the intrinsic call itself takes no mask. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast i32* %x.addr.014 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4 |
| ; Operation under test: the second operand is a constant splat of 10. |
| %3 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) |
| %4 = bitcast i32* %y.addr.013 to <4 x i32>* |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vqrdmulh: in-place loop over %s1 using the predicated qrdmulh intrinsic with a |
| ; constant splat of 10. The same vctp32 mask drives the load, the intrinsic and |
| ; the store. The checks expect the 10 in r2 as the scalar operand of |
| ; vqrdmulh.s32 inside a dlstp.32/letp loop. |
| define void @vqrdmulh(i32* %s1, i32 %N) { |
| ; CHECK-LABEL: vqrdmulh: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB21_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB21_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vqrdmulh.s32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB21_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| ; %1 is used both as the load address and as the store address (in-place update). |
| %1 = bitcast i32* %s1.addr.013 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; Operation under test. The trailing %2 operand is presumably the value for |
| ; inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vmlaq: loop that masked-loads vectors from %x and %y, multiplies the %y |
| ; vector by a constant splat of 10, adds the %x vector, and masked-stores the |
| ; sum back to %y. The checks expect the mul+add pair to become a single |
| ; vmla.i32 with the 10 in r3, inside a dlstp.32/letp loop. |
| define void @vmlaq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vmlaq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB22_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB22_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1] |
| ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 |
| ; CHECK-NEXT: vmla.i32 q1, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB22_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp14 = icmp sgt i32 %n, 0 |
| br i1 %cmp14, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.017 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.016 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks both loads and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) |
| %1 = bitcast i32* %x.addr.017 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.017, i32 4 |
| %3 = bitcast i32* %y.addr.016 to <4 x i32>* |
| %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; Pattern under test: (y * splat(10)) + x, written as plain mul and add. |
| %5 = mul <4 x i32> %4, <i32 10, i32 10, i32 10, i32 10> |
| %6 = add <4 x i32> %5, %2 |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %3, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.016, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.015, -4 |
| %cmp = icmp sgt i32 %i.015, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vmlaqp: same loop shape as a plain multiply-accumulate over %x and %y, but the |
| ; arithmetic is a single call to the predicated vmla.n intrinsic with the |
| ; scalar 10 passed as an i32 operand. The checks expect vmla.i32 q1, q0, r3 |
| ; with the 10 in r3, inside a dlstp.32/letp loop. |
| define void @vmlaqp(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vmlaqp: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB23_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB23_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1] |
| ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 |
| ; CHECK-NEXT: vmla.i32 q1, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB23_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp15 = icmp sgt i32 %n, 0 |
| br i1 %cmp15, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.018 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.017 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks both loads, the intrinsic and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016) |
| %1 = bitcast i32* %x.addr.018 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.018, i32 4 |
| %3 = bitcast i32* %y.addr.017 to <4 x i32>* |
| %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; Operation under test: operands are the %x vector, the %y vector, scalar 10, mask. |
| %5 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 10, <4 x i1> %0) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %3, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.017, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.016, -4 |
| %cmp = icmp sgt i32 %i.016, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vmlasq: loop that masked-loads vectors from %x and %y, multiplies them |
| ; together, adds a constant splat of 10, and masked-stores the result back to |
| ; %y. The checks expect the mul+add pair to become a single vmlas.i32 with the |
| ; 10 in r3, inside a dlstp.32/letp loop. |
| define void @vmlasq(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vmlasq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB24_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB24_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vmlas.i32 q1, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB24_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp14 = icmp sgt i32 %n, 0 |
| br i1 %cmp14, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.017 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.016 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks both loads and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) |
| %1 = bitcast i32* %x.addr.017 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.017, i32 4 |
| %3 = bitcast i32* %y.addr.016 to <4 x i32>* |
| %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; Pattern under test: (y * x) + splat(10), written as plain mul and add. |
| %5 = mul <4 x i32> %4, %2 |
| %6 = add <4 x i32> %5, <i32 10, i32 10, i32 10, i32 10> |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %3, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.016, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.015, -4 |
| %cmp = icmp sgt i32 %i.015, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vmlasqp: loop over %x and %y whose arithmetic is a single call to the |
| ; predicated vmlas.n intrinsic with the scalar 10 passed as an i32 operand; the |
| ; result is masked-stored back to %y. The checks expect vmlas.i32 q1, q0, r3 |
| ; with the 10 in r3, inside a dlstp.32/letp loop. |
| define void @vmlasqp(i32* %x, i32* %y, i32 %n) { |
| ; CHECK-LABEL: vmlasqp: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB25_1: @ %for.body.preheader |
| ; CHECK-NEXT: movs r3, #10 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB25_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1] |
| ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 |
| ; CHECK-NEXT: vmlas.i32 q1, q0, r3 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB25_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp15 = icmp sgt i32 %n, 0 |
| br i1 %cmp15, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.018 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.017 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks both loads, the intrinsic and the store. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016) |
| %1 = bitcast i32* %x.addr.018 to <4 x i32>* |
| %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| %add.ptr = getelementptr inbounds i32, i32* %x.addr.018, i32 4 |
| %3 = bitcast i32* %y.addr.017 to <4 x i32>* |
| %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) |
| ; Operation under test: operands are the %x vector, the %y vector, scalar 10, mask. |
| %5 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 10, <4 x i1> %0) |
| tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %3, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.017, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.016, -4 |
| %cmp = icmp sgt i32 %i.016, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vaddqf: float loop that masked-loads a <4 x float> from %x, adds a constant |
| ; splat of 10.0 with a plain `fadd fast`, and masked-stores the result to %y. |
| ; The checks expect the constant to be set up once before the loop with |
| ; vmov.f32 q0 and the add to use the vector-vector form vadd.f32 q1, q1, q0. |
| define void @vaddqf(float* %x, float* %y, i32 %n) { |
| ; CHECK-LABEL: vaddqf: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB26_1: @ %for.body.preheader |
| ; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB26_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 |
| ; CHECK-NEXT: vadd.f32 q1, q1, q0 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB26_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks the load and the store; the fadd itself is unpredicated. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast float* %x.addr.014 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| %add.ptr = getelementptr inbounds float, float* %x.addr.014, i32 4 |
| ; Operation under test: add of a constant splat of 10.0. |
| %3 = fadd fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0> |
| %4 = bitcast float* %y.addr.013 to <4 x float>* |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds float, float* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vaddf: in-place float loop over %s1 using the predicated add intrinsic with a |
| ; constant splat of 10.0. The checks expect the constant to be built in r2 |
| ; (movs #0 then movt #16672; 16672 = 0x4120, the upper half of the float bit |
| ; pattern 0x41200000 = 10.0) and used as the scalar operand of vadd.f32. |
| define void @vaddf(float* %s1, i32 %N) { |
| ; CHECK-LABEL: vaddf: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB27_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #0 |
| ; CHECK-NEXT: movt r2, #16672 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB27_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vadd.f32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB27_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| ; %1 is used both as the load address and as the store address (in-place update). |
| %1 = bitcast float* %s1.addr.013 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| ; Operation under test. The trailing %2 operand is presumably the value for |
| ; inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2) |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vsubqf: float loop that masked-loads a <4 x float> from %x, subtracts a |
| ; constant splat of 10.0 with a plain `fsub fast`, and masked-stores the result |
| ; to %y. The checks expect the subtraction to be emitted as an addition of |
| ; -10.0: vmov.f32 q0, #-1.0e+01 before the loop and vadd.f32 q1, q1, q0 inside. |
| define void @vsubqf(float* %x, float* %y, i32 %n) { |
| ; CHECK-LABEL: vsubqf: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB28_1: @ %for.body.preheader |
| ; CHECK-NEXT: vmov.f32 q0, #-1.000000e+01 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB28_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 |
| ; CHECK-NEXT: vadd.f32 q1, q1, q0 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB28_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks the load and the store; the fsub itself is unpredicated. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast float* %x.addr.014 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| %add.ptr = getelementptr inbounds float, float* %x.addr.014, i32 4 |
| ; Operation under test: subtraction of a constant splat of 10.0. |
| %3 = fsub fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0> |
| %4 = bitcast float* %y.addr.013 to <4 x float>* |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds float, float* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vsubf: in-place float loop over %s1 using the predicated sub intrinsic with a |
| ; constant splat of 10.0. The checks expect the constant to be built in r2 |
| ; (movs #0 then movt #16672; 16672 = 0x4120, the upper half of the float bit |
| ; pattern 0x41200000 = 10.0) and used as the scalar operand of vsub.f32. |
| define void @vsubf(float* %s1, i32 %N) { |
| ; CHECK-LABEL: vsubf: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB29_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #0 |
| ; CHECK-NEXT: movt r2, #16672 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB29_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vsub.f32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB29_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| ; %1 is used both as the load address and as the store address (in-place update). |
| %1 = bitcast float* %s1.addr.013 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| ; Operation under test. The trailing %2 operand is presumably the value for |
| ; inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2) |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vmulqf: float loop that masked-loads a <4 x float> from %x, multiplies by a |
| ; constant splat of 10.0 with a plain `fmul fast`, and masked-stores the result |
| ; to %y. The checks expect the constant to be set up once before the loop with |
| ; vmov.f32 q0 and the multiply to use the vector-vector form vmul.f32 q1, q1, q0. |
| define void @vmulqf(float* %x, float* %y, i32 %n) { |
| ; CHECK-LABEL: vmulqf: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB30_1: @ %for.body.preheader |
| ; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB30_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 |
| ; CHECK-NEXT: vmul.f32 q1, q1, q0 |
| ; CHECK-NEXT: vstrw.32 q1, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB30_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %n, 0 |
| br i1 %cmp11, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.014 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.013 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks the load and the store; the fmul itself is unpredicated. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) |
| %1 = bitcast float* %x.addr.014 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| %add.ptr = getelementptr inbounds float, float* %x.addr.014, i32 4 |
| ; Operation under test: multiply by a constant splat of 10.0. |
| %3 = fmul fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0> |
| %4 = bitcast float* %y.addr.013 to <4 x float>* |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %4, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds float, float* %y.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.012, -4 |
| %cmp = icmp sgt i32 %i.012, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vmulf: in-place float loop over %s1 using the predicated mul intrinsic with a |
| ; constant splat of 10.0. The checks expect the constant to be built in r2 |
| ; (movs #0 then movt #16672; 16672 = 0x4120, the upper half of the float bit |
| ; pattern 0x41200000 = 10.0) and used as the scalar operand of vmul.f32. |
| define void @vmulf(float* %s1, i32 %N) { |
| ; CHECK-LABEL: vmulf: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r1, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB31_1: @ %while.body.preheader |
| ; CHECK-NEXT: movs r2, #0 |
| ; CHECK-NEXT: movt r2, #16672 |
| ; CHECK-NEXT: dlstp.32 lr, r1 |
| ; CHECK-NEXT: .LBB31_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vmul.f32 q0, q0, r2 |
| ; CHECK-NEXT: vstrw.32 q0, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB31_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp11 = icmp sgt i32 %N, 0 |
| br i1 %cmp11, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) |
| ; %1 is used both as the load address and as the store address (in-place update). |
| %1 = bitcast float* %s1.addr.013 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| ; Operation under test. The trailing %2 operand is presumably the value for |
| ; inactive lanes - confirm against the intrinsic definition. |
| %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2) |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0) |
| %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %N.addr.012, -4 |
| %cmp = icmp sgt i32 %N.addr.012, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vfmaq: float loop that masked-loads vectors from %x and %y and calls the |
| ; generic llvm.fma intrinsic as fma(y, splat(10.0), x); the result is |
| ; masked-stored back to %y. The checks expect the constant to be set up once |
| ; before the loop with vmov.f32 q0 and the vector form vfma.f32 q2, q1, q0. |
| define void @vfmaq(float* %x, float* %y, i32 %n) { |
| ; CHECK-LABEL: vfmaq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB32_1: @ %for.body.preheader |
| ; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB32_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 |
| ; CHECK-NEXT: vfma.f32 q2, q1, q0 |
| ; CHECK-NEXT: vstrw.32 q2, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB32_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp14 = icmp sgt i32 %n, 0 |
| br i1 %cmp14, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.017 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.016 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; %0 masks both loads and the store; the fma call itself is unpredicated. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) |
| %1 = bitcast float* %x.addr.017 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| %add.ptr = getelementptr inbounds float, float* %x.addr.017, i32 4 |
| %3 = bitcast float* %y.addr.016 to <4 x float>* |
| %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| ; Operation under test: the constant splat of 10.0 is the second multiplicand. |
| %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x float> %2) |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %3, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds float, float* %y.addr.016, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %i.015, -4 |
| %cmp = icmp sgt i32 %i.015, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vfma: in-place float loop over %s1 using the predicated fma intrinsic as |
| ; fma(s2_vec, splat(10.0), s1_vec). The %s2 pointer is bitcast once in |
| ; while.body.lr.ph and never advanced, so every iteration loads from the same |
| ; %s2 address. The checks expect vmov.f32 q0 before the loop and the vector |
| ; form vfma.f32 q2, q1, q0 inside a dlstp.32/letp loop. |
| define void @vfma(float* %s1, float* %s2, i32 %N) { |
| ; CHECK-LABEL: vfma: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB33_1: @ %while.body.lr.ph |
| ; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB33_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vldrw.u32 q2, [r0] |
| ; CHECK-NEXT: vfma.f32 q2, q1, q0 |
| ; CHECK-NEXT: vstrw.32 q2, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB33_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| %cmp12 = icmp sgt i32 %N, 0 |
| br i1 %cmp12, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| ; Loop-invariant vector pointer for %s2, computed once outside the loop. |
| %0 = bitcast float* %s2 to <4 x float>* |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; %1 masks both loads, the intrinsic and the store. |
| %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) |
| %2 = bitcast float* %s1.addr.014 to <4 x float>* |
| %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) |
| %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) |
| ; Operation under test: the constant splat of 10.0 is the second multiplicand. |
| %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x float> %3, <4 x i1> %1) |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1) |
| %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4 |
| ; The count drops by four per iteration; loop again only if more than four remained. |
| %sub = add nsw i32 %N.addr.013, -4 |
| %cmp = icmp sgt i32 %N.addr.013, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; vfmasq: strip-mined loop over %n float elements, four lanes per iteration. |
| ; Each iteration loads a predicated vector from %x and from %y, computes |
| ; llvm.fma(x, y, splat(10.0)) (i.e. x * y + 10.0 per lane), and stores the |
| ; result back to %y under the same vctp32 predicate. Both pointers advance |
| ; by 4 floats per iteration. |
| ; The assertions below expect a dlstp/letp tail-predicated loop in which the |
| ; 10.0 splat (q0) is copied into q3 on every iteration and vfma.f32 then |
| ; accumulates the product of the two loaded vectors into that copy. |
| define void @vfmasq(float* %x, float* %y, i32 %n) { |
| ; CHECK-LABEL: vfmasq: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB34_1: @ %for.body.preheader |
| ; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB34_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vmov q3, q0 |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 |
| ; CHECK-NEXT: vfma.f32 q3, q2, q1 |
| ; CHECK-NEXT: vstrw.32 q3, [r1], #16 |
| ; CHECK-NEXT: letp lr, .LBB34_2 |
| ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| ; Skip the loop entirely when %n is zero or negative. |
| %cmp14 = icmp sgt i32 %n, 0 |
| br i1 %cmp14, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| ret void |
| |
| for.body: ; preds = %entry, %for.body |
| %x.addr.017 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ] |
| %y.addr.016 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ] |
| %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] |
| ; Lane predicate derived from the number of elements still to process. |
| %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) |
| %1 = bitcast float* %x.addr.017 to <4 x float>* |
| %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| %add.ptr = getelementptr inbounds float, float* %x.addr.017, i32 4 |
| %3 = bitcast float* %y.addr.016 to <4 x float>* |
| %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) |
| ; Unpredicated generic fma; the constant splat is the addend operand. |
| %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %2, <4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>) |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %3, i32 4, <4 x i1> %0) |
| %add.ptr1 = getelementptr inbounds float, float* %y.addr.016, i32 4 |
| %sub = add nsw i32 %i.015, -4 |
| ; The exit test uses the pre-decrement count: iterate again only if more |
| ; than four elements remained at the start of this iteration. |
| %cmp = icmp sgt i32 %i.015, 4 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; vfmas: strip-mined loop over %N float elements, four lanes per iteration. |
| ; Each iteration does masked loads from %s1 (pointer advances by 4 floats) |
| ; and from %s2 (fixed address; its bitcast is done once in the preheader), |
| ; calls the predicated MVE fma intrinsic with the two loaded vectors as the |
| ; first two operands and a splat of 10.0 as the third, and stores the result |
| ; back over the %s1 lanes under the same vctp32 predicate. |
| ; The assertions below expect a dlstp/letp tail-predicated loop in which the |
| ; 10.0 splat (q0) is copied into q3 on every iteration and vfma.f32 then |
| ; accumulates the product of the two loaded vectors into that copy. |
| define void @vfmas(float* %s1, float* %s2, i32 %N) { |
| ; CHECK-LABEL: vfmas: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r7, lr} |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: cmp r2, #1 |
| ; CHECK-NEXT: it lt |
| ; CHECK-NEXT: poplt {r7, pc} |
| ; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph |
| ; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 |
| ; CHECK-NEXT: dlstp.32 lr, r2 |
| ; CHECK-NEXT: .LBB35_2: @ %while.body |
| ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vmov q3, q0 |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vldrw.u32 q2, [r0] |
| ; CHECK-NEXT: vfma.f32 q3, q2, q1 |
| ; CHECK-NEXT: vstrw.32 q3, [r0], #16 |
| ; CHECK-NEXT: letp lr, .LBB35_2 |
| ; CHECK-NEXT: @ %bb.3: @ %while.end |
| ; CHECK-NEXT: pop {r7, pc} |
| entry: |
| ; Skip the loop entirely when %N is zero or negative. |
| %cmp12 = icmp sgt i32 %N, 0 |
| br i1 %cmp12, label %while.body.lr.ph, label %while.end |
| |
| while.body.lr.ph: ; preds = %entry |
| ; %s2 is never advanced, so its vector-pointer cast is loop invariant. |
| %0 = bitcast float* %s2 to <4 x float>* |
| br label %while.body |
| |
| while.body: ; preds = %while.body.lr.ph, %while.body |
| %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] |
| %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] |
| ; Lane predicate derived from the number of elements still to process. |
| %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) |
| %2 = bitcast float* %s1.addr.014 to <4 x float>* |
| %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) |
| %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) |
| ; Same operand order as @vfmasq (loaded, loaded, splat) but through the |
| ; predicated MVE intrinsic instead of the generic llvm.fma. |
| %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %1) |
| tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1) |
| %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4 |
| %sub = add nsw i32 %N.addr.013, -4 |
| ; The exit test uses the pre-decrement count: iterate again only if more |
| ; than four elements remained at the start of this iteration. |
| %cmp = icmp sgt i32 %N.addr.013, 4 |
| br i1 %cmp, label %while.body, label %while.end |
| |
| while.end: ; preds = %while.body, %entry |
| ret void |
| } |
| |
| ; rgbconvert: two-level loop nest. The outer loop runs %iHeight times |
| ; (sign-extended from i16) and is skipped when %iHeight <= 0. After each row |
| ; the source pointer advances by %iSourceStride i32 elements and the target |
| ; pointer by %iTargetStride i16 elements. |
| ; The inner do-while loop starts with a count of %iWidth (sign-extended) and |
| ; handles four elements per iteration under a vctp32 predicate: it loads four |
| ; i32 values, applies three predicated qdmulh operations with different |
| ; constant multipliers, masks each result to a disjoint bit field, ORs the |
| ; three fields together, truncates to i16 and does a masked store. |
| ; NOTE(review): the three masks cover bits 0-4, 5-10 and 11-15, which looks |
| ; like packing into a 5-6-5 16-bit pixel - confirm against the original C. |
| ; The assertions below expect the inner loop to become a dlstp/letp |
| ; tail-predicated loop, with the multipliers held in scalar registers and the |
| ; three mask vectors set up once before the outer loop. |
| define void @rgbconvert(i32* noalias %pwSourceBase, i16 signext %iSourceStride, i16* noalias %phwTargetBase, i16 signext %iTargetStride, i16 %iHeight, i16 %iWidth) { |
| ; CHECK-LABEL: rgbconvert: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: .vsave {d8, d9, d10, d11} |
| ; CHECK-NEXT: vpush {d8, d9, d10, d11} |
| ; CHECK-NEXT: .pad #8 |
| ; CHECK-NEXT: sub sp, #8 |
| ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill |
| ; CHECK-NEXT: ldrsh.w r3, [sp, #80] |
| ; CHECK-NEXT: cmp r3, #1 |
| ; CHECK-NEXT: blt .LBB36_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph |
| ; CHECK-NEXT: mov r9, r2 |
| ; CHECK-NEXT: ldr r2, [sp, #84] |
| ; CHECK-NEXT: mov.w r10, #0 |
| ; CHECK-NEXT: mov.w r11, #8388608 |
| ; CHECK-NEXT: mov.w r4, #67108864 |
| ; CHECK-NEXT: sxth.w r12, r2 |
| ; CHECK-NEXT: vmov.i32 q0, #0xf800 |
| ; CHECK-NEXT: vmov.i32 q1, #0x1f |
| ; CHECK-NEXT: mov.w r2, #2016 |
| ; CHECK-NEXT: mov.w r7, #268435456 |
| ; CHECK-NEXT: vdup.32 q2, r2 |
| ; CHECK-NEXT: .LBB36_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB36_3 Depth 2 |
| ; CHECK-NEXT: mov r2, r9 |
| ; CHECK-NEXT: mov r5, r0 |
| ; CHECK-NEXT: dlstp.32 lr, r12 |
| ; CHECK-NEXT: .LBB36_3: @ %do.body |
| ; CHECK-NEXT: @ Parent Loop BB36_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: vldrw.u32 q3, [r5], #16 |
| ; CHECK-NEXT: vqdmulh.s32 q4, q3, r4 |
| ; CHECK-NEXT: vqdmulh.s32 q5, q3, r7 |
| ; CHECK-NEXT: vqdmulh.s32 q3, q3, r11 |
| ; CHECK-NEXT: vand q4, q4, q2 |
| ; CHECK-NEXT: vand q5, q5, q1 |
| ; CHECK-NEXT: vand q3, q3, q0 |
| ; CHECK-NEXT: vorr q4, q4, q5 |
| ; CHECK-NEXT: vorr q3, q4, q3 |
| ; CHECK-NEXT: vstrh.32 q3, [r2], #8 |
| ; CHECK-NEXT: letp lr, .LBB36_3 |
| ; CHECK-NEXT: @ %bb.4: @ %do.end |
| ; CHECK-NEXT: @ in Loop: Header=BB36_2 Depth=1 |
| ; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r10, r10, #1 |
| ; CHECK-NEXT: add.w r0, r0, r1, lsl #2 |
| ; CHECK-NEXT: cmp r10, r3 |
| ; CHECK-NEXT: add.w r9, r9, r2, lsl #1 |
| ; CHECK-NEXT: bne .LBB36_2 |
| ; CHECK-NEXT: .LBB36_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #8 |
| ; CHECK-NEXT: vpop {d8, d9, d10, d11} |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
| entry: |
| ; Widen the i16 row count and both strides to i32 once, up front. |
| %conv = sext i16 %iHeight to i32 |
| %conv9 = sext i16 %iSourceStride to i32 |
| %conv11 = sext i16 %iTargetStride to i32 |
| %cmp37 = icmp sgt i16 %iHeight, 0 |
| br i1 %cmp37, label %for.body.lr.ph, label %for.cond.cleanup |
| |
| for.body.lr.ph: ; preds = %entry |
| ; Initial element count for every run of the inner loop. |
| %conv2 = sext i16 %iWidth to i32 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %do.end, %entry |
| ret void |
| |
| for.body: ; preds = %for.body.lr.ph, %do.end |
| ; Outer-loop state: row base pointers for source/target and the row index. |
| %pwSourceBase.addr.040 = phi i32* [ %pwSourceBase, %for.body.lr.ph ], [ %add.ptr10, %do.end ] |
| %phwTargetBase.addr.039 = phi i16* [ %phwTargetBase, %for.body.lr.ph ], [ %add.ptr12, %do.end ] |
| %y.038 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %do.end ] |
| br label %do.body |
| |
| do.body: ; preds = %do.body, %for.body |
| ; Inner-loop state: running pointers within the row and elements remaining. |
| %pTarget.0 = phi i16* [ %phwTargetBase.addr.039, %for.body ], [ %add.ptr6, %do.body ] |
| %pSource.0 = phi i32* [ %pwSourceBase.addr.040, %for.body ], [ %add.ptr, %do.body ] |
| %blkCnt.0 = phi i32 [ %conv2, %for.body ], [ %sub, %do.body ] |
| %l2 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) |
| %l3 = bitcast i32* %pSource.0 to <4 x i32>* |
| %l4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %l3, i32 4, <4 x i1> %l2, <4 x i32> zeroinitializer) |
| ; NOTE(review): the multipliers are 2^28, 2^26 and 2^23; with a doubling |
| ; high-half multiply that should amount to right shifts by 3, 5 and 8 - |
| ; confirm against the VQDMULH definition. |
| %l5 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 268435456, i32 268435456, i32 268435456, i32 268435456>, <4 x i1> %l2, <4 x i32> undef) |
| ; 31 = 0x1f: keep bits 0-4. |
| %and = and <4 x i32> %l5, <i32 31, i32 31, i32 31, i32 31> |
| %l6 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 67108864, i32 67108864, i32 67108864, i32 67108864>, <4 x i1> %l2, <4 x i32> undef) |
| ; 2016 = 0x7e0: keep bits 5-10. |
| %and3 = and <4 x i32> %l6, <i32 2016, i32 2016, i32 2016, i32 2016> |
| %l7 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 8388608, i32 8388608, i32 8388608, i32 8388608>, <4 x i1> %l2, <4 x i32> undef) |
| ; 63488 = 0xf800: keep bits 11-15. |
| %and4 = and <4 x i32> %l7, <i32 63488, i32 63488, i32 63488, i32 63488> |
| ; Merge the three disjoint bit fields and narrow each lane to 16 bits. |
| %or = or <4 x i32> %and3, %and |
| %or5 = or <4 x i32> %or, %and4 |
| %l8 = trunc <4 x i32> %or5 to <4 x i16> |
| %l9 = bitcast i16* %pTarget.0 to <4 x i16>* |
| tail call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %l8, <4 x i16>* %l9, i32 2, <4 x i1> %l2) |
| %add.ptr = getelementptr inbounds i32, i32* %pSource.0, i32 4 |
| %add.ptr6 = getelementptr inbounds i16, i16* %pTarget.0, i32 4 |
| %sub = add nsw i32 %blkCnt.0, -4 |
| ; The exit test uses the pre-decrement count: iterate again only if more |
| ; than four elements remained at the start of this iteration. |
| %cmp7 = icmp sgt i32 %blkCnt.0, 4 |
| br i1 %cmp7, label %do.body, label %do.end |
| |
| do.end: ; preds = %do.body |
| ; Step both row base pointers by their strides (in elements) and count the row. |
| %add.ptr10 = getelementptr inbounds i32, i32* %pwSourceBase.addr.040, i32 %conv9 |
| %add.ptr12 = getelementptr inbounds i16, i16* %phwTargetBase.addr.039, i32 %conv11 |
| %inc = add nuw nsw i32 %y.038, 1 |
| %exitcond.not = icmp eq i32 %inc, %conv |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| |
| ; Intrinsic declarations for this test file. |
| ; Lane-predicate generation and masked vector loads/stores. |
| declare <4 x i1> @llvm.arm.mve.vctp32(i32) |
| declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) |
| declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) |
| declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) |
| declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) |
| declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) |
| ; NOTE(review): attribute group #3 is not defined in the visible part of the |
| ; file - confirm whether the reference is still needed. |
| declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>) #3 |
| |
| ; Integer (v4i32) arithmetic intrinsics: predicated MVE forms alongside the |
| ; generic saturating add/sub and the unpredicated MVE forms. |
| declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) |
| declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) |
| declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32) |
| declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32) |
| ; NOTE(review): attribute group #1 is not defined in the visible part of the |
| ; file - confirm whether the reference is still needed. |
| declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1 |
| declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) |
| declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) |
| declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) |
| ; Floating-point (v4f32) intrinsics: predicated MVE add/sub/mul/fma and the |
| ; generic llvm.fma. |
| declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) |
| declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) |
| declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) |
| declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) |
| declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) |