blob: 3d5945d70768c73c21af80e5b6f4864c76513be4 [file]
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
; The use of the dotp instruction means we never have an i32 vector, so we don't
; get any spills normally and with a reduced number of registers the number of
; spills is small enough that it doesn't prevent use of a larger VF.
define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'dotp'
;
; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
;
; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
; CHECK-REGS-VP: Cost for VF vscale x 8: 5 (Estimated cost per lane: 0.
; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.
; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
;
; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 13 (Estimated cost per lane: 1.
; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16.
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
%gep.a = getelementptr i8, ptr %a, i64 %iv
%load.a = load i8, ptr %gep.a, align 1
%ext.a = zext i8 %load.a to i32
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%mul = mul i32 %ext.b, %ext.a
%add = add i32 %accum, %mul
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.exit, label %for.body
for.exit:
ret i32 %add
}
; The largest type used in the loop is small enough that we already consider all
; VFs and maximize-bandwidth does nothing.
define void @type_too_small(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'type_too_small'
; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.
; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.
; CHECK: LV: Selecting VF: vscale x 16.
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep.a = getelementptr i8, ptr %a, i64 %iv
%load.a = load i8, ptr %gep.a, align 1
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%add = add i8 %load.a, %load.b
store i8 %add, ptr %gep.a, align 1
%iv.next = add i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1024
br i1 %exitcond, label %exit, label %loop
exit:
ret void
}
; With reduced number of registers the spills from high pressure are enough that
; we use the same VF as if we hadn't maximized the bandwidth.
define void @high_pressure(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'high_pressure'
;
; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
;
; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.
; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.
; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
; CHECK-NOREGS-VP: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 20 (Estimated cost per lane: 2.
; CHECK-NOREGS-VP: LV(REG): Cost of 14 from 7 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 39 (Estimated cost per lane: 2.
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep.a = getelementptr i32, ptr %a, i64 %iv
%load.a = load i32, ptr %gep.a, align 4
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%add = add i32 %load.a, %ext.b
store i32 %add, ptr %gep.a, align 4
%iv.next = add i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1024
br i1 %exitcond, label %exit, label %loop
exit:
ret void
}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }