; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=arm64e-apple-darwin -o - | FileCheck %s
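; The loops below are unrolled by opt; llc should then select the accumulating
; absolute-difference instructions (saba/uaba, sabal/uabal, uabal2) for the
; resulting abd + add chains and combine the partial accumulators before the
; final reduction.

; i8 -> i16 widening accumulation: sabd + zext + add should select sabal.8h.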
define i16 @sabal_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: sabal_i8_to_i16_accumulation
entry:
br label %loop
loop:
; CHECK: sabdl.8h v1
; CHECK: sabdl.8h v0
; CHECK: sabdl.8h v2
; CHECK: sabal.8h v1
; CHECK: sabal.8h v0
; CHECK: sabal.8h v2
; CHECK: sabal.8h v1
; CHECK: sabal.8h v0
; CHECK: add.8h v1, v2, v1
; CHECK: add.8h v0, v1, v0
; CHECK: addv.8h
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
%a = load <8 x i8>, ptr %ptr1_i, align 1
%b = load <8 x i8>, ptr %ptr2_i, align 1
%vabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%vabd_ext = zext <8 x i8> %vabd to <8 x i16>
%acc_next = add <8 x i16> %vabd_ext, %acc_phi
%next_i = add i32 %i, 8
%cmp = icmp slt i32 %next_i, 64
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %acc_next)
ret i16 %reduce
}
; Declare the signed absolute difference and add-reduction intrinsics
declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
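; i16 -> i32 widening accumulation: sabd + zext + add should select sabal.4s.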
define i32 @sabal_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: sabal_i16_to_i32_accumulation
entry:
br label %loop
loop:
; CHECK: sabdl.4s v1
; CHECK: sabdl.4s v0
; CHECK: sabdl.4s v2
; CHECK: sabal.4s v1
; CHECK: sabal.4s v0
; CHECK: sabal.4s v2
; CHECK: sabal.4s v1
; CHECK: sabal.4s v0
; CHECK: add.4s v1, v2, v1
; CHECK: add.4s v0, v1, v0
; CHECK: addv.4s
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
%a = load <4 x i16>, ptr %ptr1_i, align 1
%b = load <4 x i16>, ptr %ptr2_i, align 1
%vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
%vmov = zext <4 x i16> %vabd to <4 x i32>
%acc_next = add <4 x i32> %vmov, %acc_phi
%next_i = add i32 %i, 4
%cmp = icmp slt i32 %next_i, 32
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc_next)
ret i32 %reduce
}
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
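; Both halves of a 16-byte load are accumulated: the high half should select
; uabdl2/uabal2.8h and the low half uabdl/uabal.8h.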
define i16 @uabal2_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uabal2_accumulation
entry:
br label %loop
loop:
; CHECK: uabdl2.8h v4
; CHECK: uabdl.8h v1
; CHECK: uabdl2.8h v24
; CHECK: uabdl2.8h v25
; CHECK: uabal2.8h v4
; CHECK: uabal2.8h v24
; CHECK: uabal2.8h v25
; CHECK: uabal2.8h v4
; CHECK: uabal2.8h v24
; CHECK: add.8h v4, v25, v4
; CHECK: add.8h v4, v4, v24
; CHECK: uabdl.8h v0
; CHECK: uabdl.8h v2
; CHECK: uabal.8h v1
; CHECK: uabal.8h v0
; CHECK: uabal.8h v2
; CHECK: uabal.8h v1
; CHECK: uabal.8h v0
; CHECK: add.8h v1, v2, v1
; CHECK: add.8h v0, v1, v0
; CHECK: add.8h v0, v4, v0
; CHECK: addv.8h h0, v0
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi_hi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_hi, %loop ]
%acc_phi_lo = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_lo, %loop ]
%ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
%a = load <16 x i8>, ptr %ptr1_i, align 1
%b = load <16 x i8>, ptr %ptr2_i, align 1
%a_hi = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%b_hi = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%a_lo = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%b_lo = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vabd_hi = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a_hi, <8 x i8> %b_hi)
%vabd_lo = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a_lo, <8 x i8> %b_lo)
%vmov_hi = zext <8 x i8> %vabd_hi to <8 x i16>
%vmov_lo = zext <8 x i8> %vabd_lo to <8 x i16>
%acc_next_hi = add <8 x i16> %vmov_hi, %acc_phi_hi
%acc_next_lo = add <8 x i16> %vmov_lo, %acc_phi_lo
%next_i = add i32 %i, 16
%cmp = icmp slt i32 %next_i, 128
br i1 %cmp, label %loop, label %exit
exit:
%hi_plus_lo = add <8 x i16> %acc_next_hi, %acc_next_lo
%reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %hi_plus_lo)
ret i16 %reduce
}
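; Non-widening accumulation: uabd feeding the accumulator add should select uaba.4s.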
define i32 @uaba_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uaba_accumulation
entry:
br label %loop
loop:
; CHECK: uabd.4s v0
; CHECK: uabd.4s v1
; CHECK: uabd.4s v2
; CHECK: uaba.4s v0
; CHECK: uaba.4s v1
; CHECK: uaba.4s v2
; CHECK: uaba.4s v0
; CHECK: uaba.4s v1
; CHECK: add.4s v0, v2, v0
; CHECK: add.4s v0, v0, v1
; CHECK: addv.4s
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i32, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i32, ptr %ptr2, i32 %i
%a = load <4 x i32>, ptr %ptr1_i, align 1
%b = load <4 x i32>, ptr %ptr2_i, align 1
%vabd = tail call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %a, <4 x i32> %b)
%acc_next = add <4 x i32> %acc_phi, %vabd
%next_i = add i32 %i, 4
%cmp = icmp slt i32 %next_i, 32
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc_next)
ret i32 %reduce
}
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
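; Signed variant of the previous test: sabd + add should select saba.4s.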
define i32 @saba_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: saba_accumulation
entry:
br label %loop
loop:
; CHECK: sabd.4s v0
; CHECK: sabd.4s v1
; CHECK: sabd.4s v2
; CHECK: saba.4s v0
; CHECK: saba.4s v1
; CHECK: saba.4s v2
; CHECK: saba.4s v0
; CHECK: saba.4s v1
; CHECK: add.4s v0, v2, v0
; CHECK: add.4s v0, v0, v1
; CHECK: addv.4s
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
; Load values from ptr1 and ptr2
%ptr1_i = getelementptr i32, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i32, ptr %ptr2, i32 %i
%a = load <4 x i32>, ptr %ptr1_i, align 1
%b = load <4 x i32>, ptr %ptr2_i, align 1
; Perform the intrinsic operation
%vabd = tail call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a, <4 x i32> %b)
%acc_next = add <4 x i32> %acc_phi, %vabd
; Increment loop counter and check the bound
%next_i = add i32 %i, 4
%cmp = icmp slt i32 %next_i, 32
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc_next)
ret i32 %reduce
}
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
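; The [su]aba tests below repeat the non-widening accumulation pattern for
; other vector widths and element types.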
define i32 @uaba_v2i32_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uaba_v2i32_accumulation
entry:
br label %loop
loop:
; CHECK: uabd.2s v0
; CHECK: uabd.2s v1
; CHECK: uabd.2s v2
; CHECK: uaba.2s v0
; CHECK: uaba.2s v1
; CHECK: uaba.2s v2
; CHECK: uaba.2s v0
; CHECK: uaba.2s v1
; CHECK: add.2s v0, v2, v0
; CHECK: add.2s v0, v0, v1
; CHECK: addp.2s
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i32, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i32, ptr %ptr2, i32 %i
%a = load <2 x i32>, ptr %ptr1_i, align 1
%b = load <2 x i32>, ptr %ptr2_i, align 1
%vabd = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
%acc_next = add <2 x i32> %acc_phi, %vabd
%next_i = add i32 %i, 2
%cmp = icmp slt i32 %next_i, 16
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %acc_next)
ret i32 %reduce
}
define i8 @uaba_v8i8_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uaba_v8i8_accumulation
entry:
br label %loop
loop:
; CHECK: uabd.8b v0
; CHECK: uabd.8b v1
; CHECK: uabd.8b v2
; CHECK: uaba.8b v0
; CHECK: uaba.8b v1
; CHECK: uaba.8b v2
; CHECK: uaba.8b v0
; CHECK: uaba.8b v1
; CHECK: add.8b v0, v2, v0
; CHECK: add.8b v0, v0, v1
; CHECK: addv.8b
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
%a = load <8 x i8>, ptr %ptr1_i, align 1
%b = load <8 x i8>, ptr %ptr2_i, align 1
%vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%acc_next = add <8 x i8> %acc_phi, %vabd
%next_i = add i32 %i, 8
%cmp = icmp slt i32 %next_i, 64
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %acc_next)
ret i8 %reduce
}
define i8 @uaba_v16i8_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uaba_v16i8_accumulation
entry:
br label %loop
loop:
; CHECK: uabd.16b v0
; CHECK: uabd.16b v1
; CHECK: uabd.16b v2
; CHECK: uaba.16b v0
; CHECK: uaba.16b v1
; CHECK: uaba.16b v2
; CHECK: uaba.16b v0
; CHECK: uaba.16b v1
; CHECK: add.16b v0, v2, v0
; CHECK: add.16b v0, v0, v1
; CHECK: addv.16b
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <16 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
%a = load <16 x i8>, ptr %ptr1_i, align 1
%b = load <16 x i8>, ptr %ptr2_i, align 1
%vabd = tail call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b)
%acc_next = add <16 x i8> %acc_phi, %vabd
%next_i = add i32 %i, 16
%cmp = icmp slt i32 %next_i, 128
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %acc_next)
ret i8 %reduce
}
define i16 @uaba_v8i16_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uaba_v8i16_accumulation
entry:
br label %loop
loop:
; CHECK: uabd.8h v0
; CHECK: uabd.8h v1
; CHECK: uabd.8h v2
; CHECK: uaba.8h v0
; CHECK: uaba.8h v1
; CHECK: uaba.8h v2
; CHECK: uaba.8h v0
; CHECK: uaba.8h v1
; CHECK: add.8h v0, v2, v0
; CHECK: add.8h v0, v0, v1
; CHECK: addv.8h
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
%a = load <8 x i16>, ptr %ptr1_i, align 1
%b = load <8 x i16>, ptr %ptr2_i, align 1
%vabd = tail call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b)
%acc_next = add <8 x i16> %acc_phi, %vabd
%next_i = add i32 %i, 8
%cmp = icmp slt i32 %next_i, 64
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %acc_next)
ret i16 %reduce
}
define i8 @saba_v8i8_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: saba_v8i8_accumulation
entry:
br label %loop
loop:
; CHECK: sabd.8b v0
; CHECK: sabd.8b v1
; CHECK: sabd.8b v2
; CHECK: saba.8b v0
; CHECK: saba.8b v1
; CHECK: saba.8b v2
; CHECK: saba.8b v0
; CHECK: saba.8b v1
; CHECK: add.8b v0, v2, v0
; CHECK: add.8b v0, v0, v1
; CHECK: addv.8b
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
%a = load <8 x i8>, ptr %ptr1_i, align 1
%b = load <8 x i8>, ptr %ptr2_i, align 1
%vabd = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%acc_next = add <8 x i8> %acc_phi, %vabd
%next_i = add i32 %i, 8
%cmp = icmp slt i32 %next_i, 64
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %acc_next)
ret i8 %reduce
}
define i16 @saba_v4i16_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: saba_v4i16_accumulation
entry:
br label %loop
loop:
; CHECK: sabd.4h v0
; CHECK: sabd.4h v1
; CHECK: sabd.4h v2
; CHECK: saba.4h v0
; CHECK: saba.4h v1
; CHECK: saba.4h v2
; CHECK: saba.4h v0
; CHECK: saba.4h v1
; CHECK: add.4h v0, v2, v0
; CHECK: add.4h v0, v0, v1
; CHECK: addv.4h
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <4 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
%a = load <4 x i16>, ptr %ptr1_i, align 1
%b = load <4 x i16>, ptr %ptr2_i, align 1
%vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
%acc_next = add <4 x i16> %acc_phi, %vabd
%next_i = add i32 %i, 4
%cmp = icmp slt i32 %next_i, 32
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %acc_next)
ret i16 %reduce
}
define i16 @saba_v8i16_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: saba_v8i16_accumulation
entry:
br label %loop
loop:
; CHECK: sabd.8h v0
; CHECK: sabd.8h v1
; CHECK: sabd.8h v2
; CHECK: saba.8h v0
; CHECK: saba.8h v1
; CHECK: saba.8h v2
; CHECK: saba.8h v0
; CHECK: saba.8h v1
; CHECK: add.8h v0, v2, v0
; CHECK: add.8h v0, v0, v1
; CHECK: addv.8h
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
%a = load <8 x i16>, ptr %ptr1_i, align 1
%b = load <8 x i16>, ptr %ptr2_i, align 1
%vabd = tail call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b)
%acc_next = add <8 x i16> %acc_phi, %vabd
%next_i = add i32 %i, 8
%cmp = icmp slt i32 %next_i, 64
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %acc_next)
ret i16 %reduce
}
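; Unsigned widening accumulation, mirroring the sabal tests at the top of the
; file: uabd + zext + add should select uabal.8h and uabal.4s.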
define i16 @uabal_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uabal_i8_to_i16_accumulation
entry:
br label %loop
loop:
; CHECK: uabdl.8h v1
; CHECK: uabdl.8h v0
; CHECK: uabdl.8h v2
; CHECK: uabal.8h v1
; CHECK: uabal.8h v0
; CHECK: uabal.8h v2
; CHECK: uabal.8h v1
; CHECK: uabal.8h v0
; CHECK: add.8h v1, v2, v1
; CHECK: add.8h v0, v1, v0
; CHECK: addv.8h
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
%a = load <8 x i8>, ptr %ptr1_i, align 1
%b = load <8 x i8>, ptr %ptr2_i, align 1
%vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%vmov = zext <8 x i8> %vabd to <8 x i16>
%acc_next = add <8 x i16> %vmov, %acc_phi
%next_i = add i32 %i, 8
%cmp = icmp slt i32 %next_i, 64
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %acc_next)
ret i16 %reduce
}
define i32 @uabal_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: uabal_i16_to_i32_accumulation
entry:
br label %loop
loop:
; CHECK: uabdl.4s v1
; CHECK: uabdl.4s v0
; CHECK: uabdl.4s v2
; CHECK: uabal.4s v1
; CHECK: uabal.4s v0
; CHECK: uabal.4s v2
; CHECK: uabal.4s v1
; CHECK: uabal.4s v0
; CHECK: add.4s v1, v2, v1
; CHECK: add.4s v0, v1, v0
; CHECK: addv.4s
%i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
%acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
%ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
%ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
%a = load <4 x i16>, ptr %ptr1_i, align 1
%b = load <4 x i16>, ptr %ptr2_i, align 1
%vabd = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
%vmov = zext <4 x i16> %vabd to <4 x i32>
%acc_next = add <4 x i32> %vmov, %acc_phi
%next_i = add i32 %i, 4
%cmp = icmp slt i32 %next_i, 32
br i1 %cmp, label %loop, label %exit
exit:
%reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc_next)
ret i32 %reduce
}
; Declarations for the remaining intrinsics used above.
declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>)
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)