[SelectionDAG] Don't promote the alignment of allocas beyond the stack alignment.
allocas in LLVM IR have a specified alignment. When that alignment is
specified, the alloca has at least that alignment at runtime.
If the alloca's type has a higher preferred alignment, SelectionDAG
currently ignores the specified alignment and raises it to the preferred
alignment. It does this even if doing so would trigger stack realignment.
I don't think this makes sense, so this patch changes that.
I was looking into this for SVE in particular: for SVE, overaligning
vscale'd types is extra expensive because it requires realigning the
stack multiple times, or using dynamic allocation. (Neither of those is
currently implemented.)
I updated the expected assembly for a couple tests; in particular, for
arg-copy-elide.ll, the optimization in question does not increase the
alignment the way SelectionDAG normally would. For the rest, I just
increased the specified alignment on the allocas to match what
SelectionDAG was inferring.
Differential Revision: https://reviews.llvm.org/D79532
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
index 603e50f..705a356 100644
--- a/llvm/test/CodeGen/X86/arg-copy-elide.ll
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -53,22 +53,18 @@
}
; CHECK-LABEL: _split_i64:
-; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
; CHECK: pushl %[[csr2:[^ ]*]]
; CHECK: pushl %[[csr1:[^ ]*]]
-; CHECK: andl $-8, %esp
-; CHECK-DAG: movl 8(%ebp), %[[csr1]]
-; CHECK-DAG: movl 12(%ebp), %[[csr2]]
-; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK-DAG: movl 12(%esp), %[[csr1]]
+; CHECK-DAG: movl 16(%esp), %[[csr2]]
+; CHECK-DAG: leal 12(%esp), %[[reg:[^ ]*]]
; CHECK: pushl %[[reg]]
; CHECK: calll _addrof_i64
+; CHECK: addl $4, %esp
; CHECK-DAG: movl %[[csr1]], %eax
; CHECK-DAG: movl %[[csr2]], %edx
-; CHECK: leal -8(%ebp), %esp
; CHECK: popl %[[csr1]]
; CHECK: popl %[[csr2]]
-; CHECK: popl %ebp
; CHECK: retl
define i1 @i1_arg(i1 %x) {
@@ -101,16 +97,13 @@
}
; CHECK-LABEL: _fastcc_split_i64:
-; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
-; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
+; CHECK-DAG: movl 20(%esp), %[[r2:[^ ]*]]
; CHECK-DAG: movl %[[r2]], 4(%esp)
; CHECK-DAG: movl %edx, (%esp)
; CHECK: movl %esp, %[[reg:[^ ]*]]
; CHECK: pushl %[[reg]]
; CHECK: calll _addrof_i64
-; CHECK: popl %ebp
; CHECK: retl
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index da77ee5..edae4f0 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1164,9 +1164,9 @@
; X64-NEXT: vzeroupper
; X64-NEXT: retq
eintry:
- %__a.addr.i = alloca <4 x i64>, align 16
- %__b.addr.i = alloca <4 x i64>, align 16
- %vCr = alloca <4 x i64>, align 16
+ %__a.addr.i = alloca <4 x i64>, align 32
+ %__b.addr.i = alloca <4 x i64>, align 32
+ %vCr = alloca <4 x i64>, align 32
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
%tmp2 = load i8, i8* %cV_R.addr, align 4
@@ -1255,9 +1255,9 @@
; X64-NEXT: vzeroupper
; X64-NEXT: retq
eintry:
- %__a.addr.i = alloca <4 x i64>, align 16
- %__b.addr.i = alloca <4 x i64>, align 16
- %vCr = alloca <4 x i64>, align 16
+ %__a.addr.i = alloca <4 x i64>, align 32
+ %__b.addr.i = alloca <4 x i64>, align 32
+ %vCr = alloca <4 x i64>, align 32
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
%tmp2 = load i16, i16* %cV_R.addr, align 4
@@ -1346,9 +1346,9 @@
; X64-NEXT: vzeroupper
; X64-NEXT: retq
eintry:
- %__a.addr.i = alloca <4 x i64>, align 16
- %__b.addr.i = alloca <4 x i64>, align 16
- %vCr = alloca <4 x i64>, align 16
+ %__a.addr.i = alloca <4 x i64>, align 32
+ %__b.addr.i = alloca <4 x i64>, align 32
+ %vCr = alloca <4 x i64>, align 32
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
%tmp2 = load i32, i32* %cV_R.addr, align 4
@@ -1436,9 +1436,9 @@
; X64-NEXT: vzeroupper
; X64-NEXT: retq
eintry:
- %__a.addr.i = alloca <4 x i64>, align 16
- %__b.addr.i = alloca <4 x i64>, align 16
- %vCr = alloca <4 x i64>, align 16
+ %__a.addr.i = alloca <4 x i64>, align 32
+ %__b.addr.i = alloca <4 x i64>, align 32
+ %vCr = alloca <4 x i64>, align 32
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
%tmp2 = load i64, i64* %cV_R.addr, align 4
diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
index 232b598..b928541 100644
--- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -77,7 +77,7 @@
; X64-NEXT: popq %r13
; X64-NEXT: popq %rbp
; X64-NEXT: retq
- %y = alloca <16 x float>, align 16
+ %y = alloca <16 x float>, align 64
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>, <16 x float>* %y, align 16
@@ -158,7 +158,7 @@
; X64-NEXT: popq %r13
; X64-NEXT: popq %rbp
; X64-NEXT: retq
- %y = alloca <16 x float>, align 16
+ %y = alloca <16 x float>, align 64
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>, <16 x float>* %y, align 16
diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
index 00b53ef..8484f66 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -4,26 +4,20 @@
define void @_start() {
; CHECK-LABEL: _start:
; CHECK: # %bb.0: # %Entry
-; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-NEXT: andq $-128, %rsp
-; CHECK-NEXT: subq $256, %rsp # imm = 0x100
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: shrdq $2, %rcx, %rax
; CHECK-NEXT: shrq $2, %rcx
; CHECK-NEXT: leaq 1(,%rax,4), %rdx
-; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: shrdq $62, %rcx, %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: orq $-2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $-1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rbp, %rsp
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orq $-2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
Entry:
%y = alloca <3 x i129>, align 4
diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll
index 7c067be..9f301a2 100644
--- a/llvm/test/CodeGen/X86/movtopush.ll
+++ b/llvm/test/CodeGen/X86/movtopush.ll
@@ -246,7 +246,7 @@
entry:
%p = alloca i32, align 4
%q = alloca i32, align 4
- %s = alloca %struct.s, align 4
+ %s = alloca %struct.s, align 8
call void @good(i32 1, i32 2, i32 3, i32 4)
%pv = ptrtoint i32* %p to i32
%qv = ptrtoint i32* %q to i32
@@ -407,7 +407,7 @@
define void @test14(%struct.A* %a) {
entry:
%ref.tmp = alloca %struct.B, align 1
- %agg.tmp = alloca i64, align 4
+ %agg.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %agg.tmp to %struct.A*
%tmp = alloca %struct.B, align 1
%0 = bitcast %struct.A* %a to i64*