; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
; RUN: | FileCheck %s -check-prefix=RV64IV
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
; RUN: | FileCheck %s -check-prefix=RV32IV
; Tests adapted from AArch64.
; Test prolog sequences for stack probing when vector objects are involved.
; The space for vector objects needs probing in the general case, because
; the stack adjustment may happen to be too big (i.e. greater than the
; probe size).
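;
; All the probed prologues below share one shape (a sketch distilled from the
; RV64 checks; the RV32 variants differ only in using sw instead of sd):
;
;   csrr t1, vlenb          # t1 = total scalable frame size
;   slli t1, t1, N          #    = (size in vlenb units) * vlenb
;   lui  t2, 1              # t2 = 4096, the probe size
; .LBBx_1:
;   sub  sp, sp, t2         # extend the stack by one probe size
;   sd   zero, 0(sp)        # probe (touch) the newly exposed page
;   sub  t1, t1, t2
;   bge  t1, t2, .LBBx_1    # a full probe size still left to allocate?
;   sub  sp, sp, t1         # allocate the remainder (< 4096, needs no probe)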
define void @f_vector(ptr %out) #0 {
; RV64IV-LABEL: f_vector:
; RV64IV: # %bb.0: # %entry
; RV64IV-NEXT: csrr t1, vlenb
; RV64IV-NEXT: slli t1, t1, 1
; RV64IV-NEXT: .cfi_def_cfa t1, -16
; RV64IV-NEXT: lui t2, 1
; RV64IV-NEXT: .LBB0_1: # %entry
; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT: sub sp, sp, t2
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: sub t1, t1, t2
; RV64IV-NEXT: bge t1, t2, .LBB0_1
; RV64IV-NEXT: # %bb.2: # %entry
; RV64IV-NEXT: .cfi_def_cfa_register sp
; RV64IV-NEXT: sub sp, sp, t1
; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
; RV64IV-NEXT: csrr a0, vlenb
; RV64IV-NEXT: slli a0, a0, 1
; RV64IV-NEXT: add sp, sp, a0
; RV64IV-NEXT: .cfi_def_cfa sp, 0
; RV64IV-NEXT: ret
;
; RV32IV-LABEL: f_vector:
; RV32IV: # %bb.0: # %entry
; RV32IV-NEXT: csrr t1, vlenb
; RV32IV-NEXT: slli t1, t1, 1
; RV32IV-NEXT: .cfi_def_cfa t1, -16
; RV32IV-NEXT: lui t2, 1
; RV32IV-NEXT: .LBB0_1: # %entry
; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT: sub sp, sp, t2
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: sub t1, t1, t2
; RV32IV-NEXT: bge t1, t2, .LBB0_1
; RV32IV-NEXT: # %bb.2: # %entry
; RV32IV-NEXT: .cfi_def_cfa_register sp
; RV32IV-NEXT: sub sp, sp, t1
; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
; RV32IV-NEXT: csrr a0, vlenb
; RV32IV-NEXT: slli a0, a0, 1
; RV32IV-NEXT: add sp, sp, a0
; RV32IV-NEXT: .cfi_def_cfa sp, 0
; RV32IV-NEXT: ret
entry:
%vec = alloca <vscale x 4 x float>, align 16
ret void
}
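; For reference, the .cfi_escape bytes above decode to a DWARF CFA expression
; (register 7202 = 4096 + 0xc22 is, as I understand LLVM's convention, the
; DWARF encoding of the vlenb CSR):
;   0x0f                  DW_CFA_def_cfa_expression
;   0x0a                  expression length (10 bytes)
;   0x72 0x00             DW_OP_breg2 (sp) + 0
;   0x11 0x02             DW_OP_consts 2
;   0x92 0xa2 0x38 0x00   DW_OP_bregx 7202 (vlenb) + 0
;   0x1e                  DW_OP_mul
;   0x22                  DW_OP_plus          => CFA = sp + 2 * vlenb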
; As above, but with 4 vectors of stack space.
define void @f4_vector(ptr %out) #0 {
; RV64IV-LABEL: f4_vector:
; RV64IV: # %bb.0: # %entry
; RV64IV-NEXT: csrr t1, vlenb
; RV64IV-NEXT: slli t1, t1, 3
; RV64IV-NEXT: .cfi_def_cfa t1, -64
; RV64IV-NEXT: lui t2, 1
; RV64IV-NEXT: .LBB1_1: # %entry
; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT: sub sp, sp, t2
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: sub t1, t1, t2
; RV64IV-NEXT: bge t1, t2, .LBB1_1
; RV64IV-NEXT: # %bb.2: # %entry
; RV64IV-NEXT: .cfi_def_cfa_register sp
; RV64IV-NEXT: sub sp, sp, t1
; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
; RV64IV-NEXT: csrr a0, vlenb
; RV64IV-NEXT: slli a0, a0, 3
; RV64IV-NEXT: add sp, sp, a0
; RV64IV-NEXT: .cfi_def_cfa sp, 0
; RV64IV-NEXT: ret
;
; RV32IV-LABEL: f4_vector:
; RV32IV: # %bb.0: # %entry
; RV32IV-NEXT: csrr t1, vlenb
; RV32IV-NEXT: slli t1, t1, 3
; RV32IV-NEXT: .cfi_def_cfa t1, -64
; RV32IV-NEXT: lui t2, 1
; RV32IV-NEXT: .LBB1_1: # %entry
; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT: sub sp, sp, t2
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: sub t1, t1, t2
; RV32IV-NEXT: bge t1, t2, .LBB1_1
; RV32IV-NEXT: # %bb.2: # %entry
; RV32IV-NEXT: .cfi_def_cfa_register sp
; RV32IV-NEXT: sub sp, sp, t1
; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
; RV32IV-NEXT: csrr a0, vlenb
; RV32IV-NEXT: slli a0, a0, 3
; RV32IV-NEXT: add sp, sp, a0
; RV32IV-NEXT: .cfi_def_cfa sp, 0
; RV32IV-NEXT: ret
entry:
%vec1 = alloca <vscale x 4 x float>, align 16
%vec2 = alloca <vscale x 4 x float>, align 16
%vec3 = alloca <vscale x 4 x float>, align 16
%vec4 = alloca <vscale x 4 x float>, align 16
ret void
}
; As above, but with 16 vectors of stack space.
; On AArch64, where this test originates, the adjustment is at most
; 16 x 256 = 4096 bytes (one probe size), so the locals can be allocated
; at once there; vlenb has no such compile-time bound here, so the probe
; loop is still emitted.
define void @f16_vector(ptr %out) #0 {
; RV64IV-LABEL: f16_vector:
; RV64IV: # %bb.0: # %entry
; RV64IV-NEXT: csrr t1, vlenb
; RV64IV-NEXT: slli t1, t1, 5
; RV64IV-NEXT: .cfi_def_cfa t1, -256
; RV64IV-NEXT: lui t2, 1
; RV64IV-NEXT: .LBB2_1: # %entry
; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT: sub sp, sp, t2
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: sub t1, t1, t2
; RV64IV-NEXT: bge t1, t2, .LBB2_1
; RV64IV-NEXT: # %bb.2: # %entry
; RV64IV-NEXT: .cfi_def_cfa_register sp
; RV64IV-NEXT: sub sp, sp, t1
; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
; RV64IV-NEXT: csrr a0, vlenb
; RV64IV-NEXT: slli a0, a0, 5
; RV64IV-NEXT: add sp, sp, a0
; RV64IV-NEXT: .cfi_def_cfa sp, 0
; RV64IV-NEXT: ret
;
; RV32IV-LABEL: f16_vector:
; RV32IV: # %bb.0: # %entry
; RV32IV-NEXT: csrr t1, vlenb
; RV32IV-NEXT: slli t1, t1, 5
; RV32IV-NEXT: .cfi_def_cfa t1, -256
; RV32IV-NEXT: lui t2, 1
; RV32IV-NEXT: .LBB2_1: # %entry
; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT: sub sp, sp, t2
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: sub t1, t1, t2
; RV32IV-NEXT: bge t1, t2, .LBB2_1
; RV32IV-NEXT: # %bb.2: # %entry
; RV32IV-NEXT: .cfi_def_cfa_register sp
; RV32IV-NEXT: sub sp, sp, t1
; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
; RV32IV-NEXT: csrr a0, vlenb
; RV32IV-NEXT: slli a0, a0, 5
; RV32IV-NEXT: add sp, sp, a0
; RV32IV-NEXT: .cfi_def_cfa sp, 0
; RV32IV-NEXT: ret
entry:
%vec1 = alloca <vscale x 4 x float>, align 16
%vec2 = alloca <vscale x 4 x float>, align 16
%vec3 = alloca <vscale x 4 x float>, align 16
%vec4 = alloca <vscale x 4 x float>, align 16
%vec5 = alloca <vscale x 4 x float>, align 16
%vec6 = alloca <vscale x 4 x float>, align 16
%vec7 = alloca <vscale x 4 x float>, align 16
%vec8 = alloca <vscale x 4 x float>, align 16
%vec9 = alloca <vscale x 4 x float>, align 16
%vec10 = alloca <vscale x 4 x float>, align 16
%vec11 = alloca <vscale x 4 x float>, align 16
%vec12 = alloca <vscale x 4 x float>, align 16
%vec13 = alloca <vscale x 4 x float>, align 16
%vec14 = alloca <vscale x 4 x float>, align 16
%vec15 = alloca <vscale x 4 x float>, align 16
%vec16 = alloca <vscale x 4 x float>, align 16
ret void
}
; As above, but with 17 vectors of stack space.
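; (Each <vscale x 4 x float> occupies 2 x vlenb bytes, so 17 of them need
; 34 x vlenb; 34 is not a power of two, hence the li+mul below instead of a
; single slli.)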
define void @f17_vector(ptr %out) #0 {
; RV64IV-LABEL: f17_vector:
; RV64IV: # %bb.0: # %entry
; RV64IV-NEXT: csrr t1, vlenb
; RV64IV-NEXT: li a0, 34
; RV64IV-NEXT: mul t1, t1, a0
; RV64IV-NEXT: .cfi_def_cfa t1, -272
; RV64IV-NEXT: lui t2, 1
; RV64IV-NEXT: .LBB3_1: # %entry
; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT: sub sp, sp, t2
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: sub t1, t1, t2
; RV64IV-NEXT: bge t1, t2, .LBB3_1
; RV64IV-NEXT: # %bb.2: # %entry
; RV64IV-NEXT: .cfi_def_cfa_register sp
; RV64IV-NEXT: sub sp, sp, t1
; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
; RV64IV-NEXT: csrr a0, vlenb
; RV64IV-NEXT: li a1, 34
; RV64IV-NEXT: mul a0, a0, a1
; RV64IV-NEXT: add sp, sp, a0
; RV64IV-NEXT: .cfi_def_cfa sp, 0
; RV64IV-NEXT: ret
;
; RV32IV-LABEL: f17_vector:
; RV32IV: # %bb.0: # %entry
; RV32IV-NEXT: csrr t1, vlenb
; RV32IV-NEXT: li a0, 34
; RV32IV-NEXT: mul t1, t1, a0
; RV32IV-NEXT: .cfi_def_cfa t1, -272
; RV32IV-NEXT: lui t2, 1
; RV32IV-NEXT: .LBB3_1: # %entry
; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT: sub sp, sp, t2
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: sub t1, t1, t2
; RV32IV-NEXT: bge t1, t2, .LBB3_1
; RV32IV-NEXT: # %bb.2: # %entry
; RV32IV-NEXT: .cfi_def_cfa_register sp
; RV32IV-NEXT: sub sp, sp, t1
; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
; RV32IV-NEXT: csrr a0, vlenb
; RV32IV-NEXT: li a1, 34
; RV32IV-NEXT: mul a0, a0, a1
; RV32IV-NEXT: add sp, sp, a0
; RV32IV-NEXT: .cfi_def_cfa sp, 0
; RV32IV-NEXT: ret
entry:
%vec1 = alloca <vscale x 4 x float>, align 16
%vec2 = alloca <vscale x 4 x float>, align 16
%vec3 = alloca <vscale x 4 x float>, align 16
%vec4 = alloca <vscale x 4 x float>, align 16
%vec5 = alloca <vscale x 4 x float>, align 16
%vec6 = alloca <vscale x 4 x float>, align 16
%vec7 = alloca <vscale x 4 x float>, align 16
%vec8 = alloca <vscale x 4 x float>, align 16
%vec9 = alloca <vscale x 4 x float>, align 16
%vec10 = alloca <vscale x 4 x float>, align 16
%vec11 = alloca <vscale x 4 x float>, align 16
%vec12 = alloca <vscale x 4 x float>, align 16
%vec13 = alloca <vscale x 4 x float>, align 16
%vec14 = alloca <vscale x 4 x float>, align 16
%vec15 = alloca <vscale x 4 x float>, align 16
%vec16 = alloca <vscale x 4 x float>, align 16
%vec17 = alloca <vscale x 4 x float>, align 16
ret void
}
; A vector and a 16-byte fixed-size object.
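; The 16-byte fixed part is smaller than the probe size, so (as the checks
; show) it is allocated with a plain addi and only the scalable part goes
; through the probe loop.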
define void @f1_vector_16_arr(ptr %out) #0 {
; RV64IV-LABEL: f1_vector_16_arr:
; RV64IV: # %bb.0: # %entry
; RV64IV-NEXT: addi sp, sp, -16
; RV64IV-NEXT: .cfi_def_cfa_offset 16
; RV64IV-NEXT: csrr t1, vlenb
; RV64IV-NEXT: slli t1, t1, 1
; RV64IV-NEXT: .cfi_def_cfa t1, -16
; RV64IV-NEXT: lui t2, 1
; RV64IV-NEXT: .LBB4_1: # %entry
; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT: sub sp, sp, t2
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: sub t1, t1, t2
; RV64IV-NEXT: bge t1, t2, .LBB4_1
; RV64IV-NEXT: # %bb.2: # %entry
; RV64IV-NEXT: .cfi_def_cfa_register sp
; RV64IV-NEXT: sub sp, sp, t1
; RV64IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; RV64IV-NEXT: csrr a0, vlenb
; RV64IV-NEXT: slli a0, a0, 1
; RV64IV-NEXT: add sp, sp, a0
; RV64IV-NEXT: .cfi_def_cfa sp, 16
; RV64IV-NEXT: addi sp, sp, 16
; RV64IV-NEXT: .cfi_def_cfa_offset 0
; RV64IV-NEXT: ret
;
; RV32IV-LABEL: f1_vector_16_arr:
; RV32IV: # %bb.0: # %entry
; RV32IV-NEXT: addi sp, sp, -16
; RV32IV-NEXT: .cfi_def_cfa_offset 16
; RV32IV-NEXT: csrr t1, vlenb
; RV32IV-NEXT: slli t1, t1, 1
; RV32IV-NEXT: .cfi_def_cfa t1, -16
; RV32IV-NEXT: lui t2, 1
; RV32IV-NEXT: .LBB4_1: # %entry
; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT: sub sp, sp, t2
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: sub t1, t1, t2
; RV32IV-NEXT: bge t1, t2, .LBB4_1
; RV32IV-NEXT: # %bb.2: # %entry
; RV32IV-NEXT: .cfi_def_cfa_register sp
; RV32IV-NEXT: sub sp, sp, t1
; RV32IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; RV32IV-NEXT: csrr a0, vlenb
; RV32IV-NEXT: slli a0, a0, 1
; RV32IV-NEXT: add sp, sp, a0
; RV32IV-NEXT: .cfi_def_cfa sp, 16
; RV32IV-NEXT: addi sp, sp, 16
; RV32IV-NEXT: .cfi_def_cfa_offset 0
; RV32IV-NEXT: ret
entry:
%vec = alloca <vscale x 4 x float>, align 16
%arr = alloca i8, i64 16, align 1
ret void
}
; A large vector object and a large slot, both of which need probing.
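; The fixed part (12288 + 16 = 12304 bytes) is a compile-time constant, so it
; is probed with unrolled 4096-byte chunks plus an unprobed 16-byte tail; the
; scalable 128 * vlenb part then goes through the usual probe loop.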
define void @f1_vector_4096_arr(ptr %out) #0 {
; RV64IV-LABEL: f1_vector_4096_arr:
; RV64IV: # %bb.0: # %entry
; RV64IV-NEXT: lui a0, 1
; RV64IV-NEXT: sub sp, sp, a0
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: .cfi_def_cfa_offset 4096
; RV64IV-NEXT: lui a0, 1
; RV64IV-NEXT: sub sp, sp, a0
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: .cfi_def_cfa_offset 8192
; RV64IV-NEXT: lui a0, 1
; RV64IV-NEXT: sub sp, sp, a0
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: .cfi_def_cfa_offset 12288
; RV64IV-NEXT: addi sp, sp, -16
; RV64IV-NEXT: .cfi_def_cfa_offset 12304
; RV64IV-NEXT: csrr t1, vlenb
; RV64IV-NEXT: slli t1, t1, 7
; RV64IV-NEXT: .cfi_def_cfa t1, -1024
; RV64IV-NEXT: lui t2, 1
; RV64IV-NEXT: .LBB5_1: # %entry
; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT: sub sp, sp, t2
; RV64IV-NEXT: sd zero, 0(sp)
; RV64IV-NEXT: sub t1, t1, t2
; RV64IV-NEXT: bge t1, t2, .LBB5_1
; RV64IV-NEXT: # %bb.2: # %entry
; RV64IV-NEXT: .cfi_def_cfa_register sp
; RV64IV-NEXT: sub sp, sp, t1
; RV64IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
; RV64IV-NEXT: csrr a0, vlenb
; RV64IV-NEXT: slli a0, a0, 7
; RV64IV-NEXT: add sp, sp, a0
; RV64IV-NEXT: .cfi_def_cfa sp, 12304
; RV64IV-NEXT: lui a0, 3
; RV64IV-NEXT: addi a0, a0, 16
; RV64IV-NEXT: add sp, sp, a0
; RV64IV-NEXT: .cfi_def_cfa_offset 0
; RV64IV-NEXT: ret
;
; RV32IV-LABEL: f1_vector_4096_arr:
; RV32IV: # %bb.0: # %entry
; RV32IV-NEXT: lui a0, 1
; RV32IV-NEXT: sub sp, sp, a0
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: .cfi_def_cfa_offset 4096
; RV32IV-NEXT: lui a0, 1
; RV32IV-NEXT: sub sp, sp, a0
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: .cfi_def_cfa_offset 8192
; RV32IV-NEXT: lui a0, 1
; RV32IV-NEXT: sub sp, sp, a0
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: .cfi_def_cfa_offset 12288
; RV32IV-NEXT: addi sp, sp, -16
; RV32IV-NEXT: .cfi_def_cfa_offset 12304
; RV32IV-NEXT: csrr t1, vlenb
; RV32IV-NEXT: slli t1, t1, 7
; RV32IV-NEXT: .cfi_def_cfa t1, -1024
; RV32IV-NEXT: lui t2, 1
; RV32IV-NEXT: .LBB5_1: # %entry
; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT: sub sp, sp, t2
; RV32IV-NEXT: sw zero, 0(sp)
; RV32IV-NEXT: sub t1, t1, t2
; RV32IV-NEXT: bge t1, t2, .LBB5_1
; RV32IV-NEXT: # %bb.2: # %entry
; RV32IV-NEXT: .cfi_def_cfa_register sp
; RV32IV-NEXT: sub sp, sp, t1
; RV32IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
; RV32IV-NEXT: csrr a0, vlenb
; RV32IV-NEXT: slli a0, a0, 7
; RV32IV-NEXT: add sp, sp, a0
; RV32IV-NEXT: .cfi_def_cfa sp, 12304
; RV32IV-NEXT: lui a0, 3
; RV32IV-NEXT: addi a0, a0, 16
; RV32IV-NEXT: add sp, sp, a0
; RV32IV-NEXT: .cfi_def_cfa_offset 0
; RV32IV-NEXT: ret
entry:
%vec = alloca <vscale x 256 x float>, align 16
%arr = alloca i8, i64 12288, align 1
ret void
}
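; Note on the attributes: "probe-stack"="inline-asm" selects inline stack
; probes (as opposed to calling a probe function), and uwtable(async) requests
; the precise prologue CFI these checks assert on.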
attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }