; llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll - llvm-project - Git at Google

 ; REQUIRES: asserts
 ;
 ; The loop vectorizer is run three times, always with the interleave count
 ; forced to 1 and epilogue vectorization disabled. The checks match the debug
 ; output selected by -debug-only=loop-vectorize,vplan (stderr is merged into
 ; the FileCheck input by 2>&1):
 ;   NOMAX     - maximize-bandwidth disabled.
 ;   REGS-VP   - maximize-bandwidth enabled.
 ;   NOREGS-VP - maximize-bandwidth enabled and the number of vector registers
 ;               forced to 1.
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP

 ; All functions in this test are compiled for this data layout and triple.
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"

 ; The use of the dotp instruction means we never have an i32 vector, so we don't
 ; get any spills normally and with a reduced number of registers the number of
 ; spills is small enough that it doesn't prevent use of a larger VF.
 ;
 ; The loop runs 1024 iterations. Each one loads an i8 from %a and an i8 from
 ; %b, zero-extends both to i32, multiplies them and adds the product to an i32
 ; accumulator; the final sum is returned.
 define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: LV: Checking a loop in 'dotp'
 ;
 ; Maximize-bandwidth off: vscale x 4 is the expected choice.
 ; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
 ; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
 ;
 ; Maximize-bandwidth on: vscale x 8 and vscale x 16 are expected to be costed
 ; too (5 each, against 6 for vscale x 4), and vscale x 16 to be selected.
 ; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
 ; CHECK-REGS-VP: Cost for VF vscale x 8: 5 (Estimated cost per lane: 0.
 ; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.
 ; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
 ;
 ; Maximize-bandwidth on with the vector register count forced to 1: a spill
 ; cost of 4 (2 spills) is expected on the line directly before each of the
 ; wider VF costs, and vscale x 16 is still the expected selection.
 ; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
 ; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
 ; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 13 (Estimated cost per lane: 1.
 ; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
 ; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.
 ; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16.
 entry:
   br label %for.body

 for.body:
   ; %iv is the induction variable (starts at 0); %accum is the running sum.
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
   ; Load a[iv] and b[iv] as bytes and widen each to i32 before multiplying.
   %gep.a = getelementptr i8, ptr %a, i64 %iv
   %load.a = load i8, ptr %gep.a, align 1
   %ext.a = zext i8 %load.a to i32
   %gep.b = getelementptr i8, ptr %b, i64 %iv
   %load.b = load i8, ptr %gep.b, align 1
   %ext.b = zext i8 %load.b to i32
   %mul = mul i32 %ext.b, %ext.a
   %add = add i32 %accum, %mul
   ; Leave the loop once the incremented induction variable reaches 1024.
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %for.exit, label %for.body

 for.exit:
   ret i32 %add
 }

 ; The largest type used in the loop is small enough that we already consider all
 ; VFs and maximize-bandwidth does nothing.
 ;
 ; The loop runs 1024 iterations; each one loads an i8 from %a and an i8 from
 ; %b, adds them as i8 and stores the result back to the same element of %a.
 ; Only the common prefix is used here, so all three runs are expected to print
 ; the same costs and select vscale x 16.
 define void @type_too_small(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: LV: Checking a loop in 'type_too_small'
 ; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
 ; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.
 ; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.
 ; CHECK: LV: Selecting VF: vscale x 16.
 entry:
   br label %loop

 loop:
   ; %iv is the induction variable, starting at 0.
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %gep.a = getelementptr i8, ptr %a, i64 %iv
   %load.a = load i8, ptr %gep.a, align 1
   %gep.b = getelementptr i8, ptr %b, i64 %iv
   %load.b = load i8, ptr %gep.b, align 1
   ; The whole computation stays in i8; the sum overwrites a[iv].
   %add = add i8 %load.a, %load.b
   store i8 %add, ptr %gep.a, align 1
   ; Leave the loop once the incremented induction variable reaches 1024.
   %iv.next = add i64 %iv, 1
   %exitcond = icmp eq i64 %iv.next, 1024
   br i1 %exitcond, label %exit, label %loop

 exit:
   ret void
 }

 ; With reduced number of registers the spills from high pressure are enough that
 ; we use the same VF as if we hadn't maximized the bandwidth.
 ;
 ; The loop runs 1024 iterations; each one loads an i32 from %a and an i8 from
 ; %b, zero-extends the i8 to i32, adds the two and stores the i32 result back
 ; to the same element of %a.
 define void @high_pressure(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: LV: Checking a loop in 'high_pressure'
 ;
 ; Maximize-bandwidth off: vscale x 4 is the expected choice.
 ; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
 ; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
 ;
 ; Maximize-bandwidth on: the expected costs are 6, 10 and 21 for vscale x 4,
 ; 8 and 16, and vscale x 8 is the expected selection.
 ; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
 ; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.
 ; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.
 ; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.

 ; Maximize-bandwidth on with the vector register count forced to 1: spill
 ; costs of 6 (3 spills) and 14 (7 spills) are expected on the lines directly
 ; before the vscale x 8 and vscale x 16 costs (20 and 39), and the expected
 ; selection drops back to vscale x 4.
 ; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
 ; CHECK-NOREGS-VP: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC
 ; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 20 (Estimated cost per lane: 2.
 ; CHECK-NOREGS-VP: LV(REG): Cost of 14 from 7 spills of Generic::VectorRC
 ; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 39 (Estimated cost per lane: 2.
 ; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
 entry:
   br label %loop

 loop:
   ; %iv is the induction variable, starting at 0.
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   ; %a is indexed as i32 elements, %b as i8 elements.
   %gep.a = getelementptr i32, ptr %a, i64 %iv
   %load.a = load i32, ptr %gep.a, align 4
   %gep.b = getelementptr i8, ptr %b, i64 %iv
   %load.b = load i8, ptr %gep.b, align 1
   ; Widen the byte to i32 so the add and the store are done on i32 values.
   %ext.b = zext i8 %load.b to i32
   %add = add i32 %load.a, %ext.b
   store i32 %add, ptr %gep.a, align 4
   ; Leave the loop once the incremented induction variable reaches 1024.
   %iv.next = add i64 %iv, 1
   %exitcond = icmp eq i64 %iv.next, 1024
   br i1 %exitcond, label %exit, label %loop

 exit:
   ret void
 }

 ; Attribute group referenced as #0 by the functions above: the SVE target
 ; feature is enabled and vscale is limited to the range [1, 16].
 attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
	; REQUIRES: asserts
	;
	; NOTE(review): from this point the file repeats the test that precedes it
	; (same RUN lines, same three function names, same attribute group), only
	; tab-indented. Defining the same functions twice in one module is
	; presumably unintended - confirm whether this copy should exist at all.
	;
	; The loop vectorizer is run three times, always with the interleave count
	; forced to 1 and epilogue vectorization disabled. The checks match the debug
	; output selected by -debug-only=loop-vectorize,vplan (stderr is merged into
	; the FileCheck input by 2>&1):
	;   NOMAX     - maximize-bandwidth disabled.
	;   REGS-VP   - maximize-bandwidth enabled.
	;   NOREGS-VP - maximize-bandwidth enabled and the number of vector registers
	;               forced to 1.
	; The pipe into FileCheck must be a plain, unescaped pipe character so that
	; the shell starts a pipeline instead of handing it to opt as an argument.
	; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX
	; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP
	; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP

	; All functions that follow are compiled for this data layout and triple.
	target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
	target triple = "aarch64-none-unknown-elf"

	; The use of the dotp instruction means we never have an i32 vector, so we don't
	; get any spills normally and with a reduced number of registers the number of
	; spills is small enough that it doesn't prevent use of a larger VF.
	;
	; The loop runs 1024 iterations. Each one loads an i8 from %a and an i8 from
	; %b, zero-extends both to i32, multiplies them and adds the product to an i32
	; accumulator; the final sum is returned.
	define i32 @dotp(ptr %a, ptr %b) #0 {
	; CHECK-LABEL: LV: Checking a loop in 'dotp'
	;
	; Maximize-bandwidth off: vscale x 4 is the expected choice.
	; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
	; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
	;
	; Maximize-bandwidth on: vscale x 8 and vscale x 16 are expected to be costed
	; too (5 each, against 6 for vscale x 4), and vscale x 16 to be selected.
	; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
	; CHECK-REGS-VP: Cost for VF vscale x 8: 5 (Estimated cost per lane: 0.
	; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.
	; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
	;
	; Maximize-bandwidth on with the vector register count forced to 1: a spill
	; cost of 4 (2 spills) is expected on the line directly before each of the
	; wider VF costs, and vscale x 16 is still the expected selection.
	; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
	; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
	; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 13 (Estimated cost per lane: 1.
	; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
	; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.
	; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16.
	entry:
	br label %for.body

	for.body:
	; %iv is the induction variable (starts at 0); %accum is the running sum.
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
	%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
	; Load a[iv] and b[iv] as bytes and widen each to i32 before multiplying.
	%gep.a = getelementptr i8, ptr %a, i64 %iv
	%load.a = load i8, ptr %gep.a, align 1
	%ext.a = zext i8 %load.a to i32
	%gep.b = getelementptr i8, ptr %b, i64 %iv
	%load.b = load i8, ptr %gep.b, align 1
	%ext.b = zext i8 %load.b to i32
	%mul = mul i32 %ext.b, %ext.a
	%add = add i32 %accum, %mul
	; Leave the loop once the incremented induction variable reaches 1024.
	%iv.next = add i64 %iv, 1
	%exitcond.not = icmp eq i64 %iv.next, 1024
	br i1 %exitcond.not, label %for.exit, label %for.body

	for.exit:
	ret i32 %add
	}

	; The largest type used in the loop is small enough that we already consider all
	; VFs and maximize-bandwidth does nothing.
	;
	; The loop runs 1024 iterations; each one loads an i8 from %a and an i8 from
	; %b, adds them as i8 and stores the result back to the same element of %a.
	; Only the common prefix is used here, so all three runs are expected to print
	; the same costs and select vscale x 16.
	define void @type_too_small(ptr %a, ptr %b) #0 {
	; CHECK-LABEL: LV: Checking a loop in 'type_too_small'
	; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
	; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.
	; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.
	; CHECK: LV: Selecting VF: vscale x 16.
	entry:
	br label %loop

	loop:
	; %iv is the induction variable, starting at 0.
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	%gep.a = getelementptr i8, ptr %a, i64 %iv
	%load.a = load i8, ptr %gep.a, align 1
	%gep.b = getelementptr i8, ptr %b, i64 %iv
	%load.b = load i8, ptr %gep.b, align 1
	; The whole computation stays in i8; the sum overwrites a[iv].
	%add = add i8 %load.a, %load.b
	store i8 %add, ptr %gep.a, align 1
	; Leave the loop once the incremented induction variable reaches 1024.
	%iv.next = add i64 %iv, 1
	%exitcond = icmp eq i64 %iv.next, 1024
	br i1 %exitcond, label %exit, label %loop

	exit:
	ret void
	}

	; With reduced number of registers the spills from high pressure are enough that
	; we use the same VF as if we hadn't maximized the bandwidth.
	;
	; The loop runs 1024 iterations; each one loads an i32 from %a and an i8 from
	; %b, zero-extends the i8 to i32, adds the two and stores the i32 result back
	; to the same element of %a.
	define void @high_pressure(ptr %a, ptr %b) #0 {
	; CHECK-LABEL: LV: Checking a loop in 'high_pressure'
	;
	; Maximize-bandwidth off: vscale x 4 is the expected choice.
	; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
	; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
	;
	; Maximize-bandwidth on: the expected costs are 6, 10 and 21 for vscale x 4,
	; 8 and 16, and vscale x 8 is the expected selection.
	; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
	; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.
	; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.
	; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.

	; Maximize-bandwidth on with the vector register count forced to 1: spill
	; costs of 6 (3 spills) and 14 (7 spills) are expected on the lines directly
	; before the vscale x 8 and vscale x 16 costs (20 and 39), and the expected
	; selection drops back to vscale x 4.
	; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.
	; CHECK-NOREGS-VP: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC
	; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 20 (Estimated cost per lane: 2.
	; CHECK-NOREGS-VP: LV(REG): Cost of 14 from 7 spills of Generic::VectorRC
	; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 39 (Estimated cost per lane: 2.
	; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
	entry:
	br label %loop

	loop:
	; %iv is the induction variable, starting at 0.
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	; %a is indexed as i32 elements, %b as i8 elements.
	%gep.a = getelementptr i32, ptr %a, i64 %iv
	%load.a = load i32, ptr %gep.a, align 4
	%gep.b = getelementptr i8, ptr %b, i64 %iv
	%load.b = load i8, ptr %gep.b, align 1
	; Widen the byte to i32 so the add and the store are done on i32 values.
	%ext.b = zext i8 %load.b to i32
	%add = add i32 %load.a, %ext.b
	store i32 %add, ptr %gep.a, align 4
	; Leave the loop once the incremented induction variable reaches 1024.
	%iv.next = add i64 %iv, 1
	%exitcond = icmp eq i64 %iv.next, 1024
	br i1 %exitcond, label %exit, label %loop

	exit:
	ret void
	}

	; Attribute group referenced as #0 by the functions above: the SVE target
	; feature is enabled and vscale is limited to the range [1, 16].
	attributes #0 = { vscale_range(1,16) "target-features"="+sve" }