; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+b | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+b,+zvfh | FileCheck %s --check-prefixes=CHECK,ZVFH
declare void @g()
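; The %m/%n and %r/%s pairs are not live across the call, so they are merged
; into vector loads/stores; the %p values that must survive the call to @g
; are instead kept in scalar callee-saved registers.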
define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
; CHECK-LABEL: f:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -48
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: .cfi_offset s2, -32
; CHECK-NEXT: .cfi_offset s3, -40
; CHECK-NEXT: .cfi_offset s4, -48
; CHECK-NEXT: mv s0, a5
; CHECK-NEXT: mv s1, a4
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ld s3, 0(a2)
; CHECK-NEXT: ld s4, 8(a2)
; CHECK-NEXT: mv s2, a3
; CHECK-NEXT: call g
; CHECK-NEXT: sd s3, 0(s2)
; CHECK-NEXT: sd s4, 8(s2)
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (s1)
; CHECK-NEXT: vse64.v v8, (s0)
; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s3, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s4, 0(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: .cfi_restore s2
; CHECK-NEXT: .cfi_restore s3
; CHECK-NEXT: .cfi_restore s4
; CHECK-NEXT: addi sp, sp, 48
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%z0 = load i64, ptr %m
%m.1 = getelementptr i64, ptr %m, i64 1
%z1 = load i64, ptr %m.1
store i64 %z0, ptr %n
%n.1 = getelementptr i64, ptr %n, i64 1
store i64 %z1, ptr %n.1
%x0 = load i64, ptr %p
%p.1 = getelementptr i64, ptr %p, i64 1
%x1 = load i64, ptr %p.1
call void @g()
store i64 %x0, ptr %q
%q.1 = getelementptr i64, ptr %q, i64 1
store i64 %x1, ptr %q.1
%y0 = load i64, ptr %r
%r.1 = getelementptr i64, ptr %r, i64 1
%y1 = load i64, ptr %r.1
store i64 %y0, ptr %s
%s.1 = getelementptr i64, ptr %s, i64 1
store i64 %y1, ptr %s.1
ret void
}
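; The load/store pair is still merged into vector accesses even with the
; strict (constrained) fptoui conversion between them.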
define void @f1(ptr %p, ptr %q, double %t) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: fcvt.wu.d a0, fa0, rtz
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
%x0 = load i64, ptr %p
%p.1 = getelementptr i64, ptr %p, i64 1
%x1 = load i64, ptr %p.1
%t1 = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %t, metadata !"fpexcept.strict")
store i64 %x0, ptr %q
%q.1 = getelementptr i64, ptr %q, i64 1
store i64 %x1, ptr %q.1
ret void
}
; Merging scalars is profitable: it reduces pressure within a single
; register class.
define void @i8_i16(ptr %p, ptr %q) {
; CHECK-LABEL: i8_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: lh s1, 0(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: sh s1, 0(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 1
%x0 = load i8, ptr %p0, align 2
%x1 = load i8, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 1
store i8 %x0, ptr %q0, align 2
store i8 %x1, ptr %q1
ret void
}
define void @i8_i16_rotate(ptr %p, ptr %q) {
; CHECK-LABEL: i8_i16_rotate:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: .cfi_offset s2, -32
; CHECK-NEXT: lbu s1, 0(a0)
; CHECK-NEXT: lbu s2, 1(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: sb s2, 0(s0)
; CHECK-NEXT: sb s1, 1(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: .cfi_restore s2
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 1
%x0 = load i8, ptr %p0, align 2
%x1 = load i8, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 1
store i8 %x1, ptr %q0, align 2
store i8 %x0, ptr %q1
ret void
}
; We could reorder the first call and the load here to enable
; merging, but don't currently do so.
define void @i8_i16_resched_readnone_ld(ptr %p, ptr %q) {
; CHECK-LABEL: i8_i16_resched_readnone_ld:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: .cfi_offset s2, -32
; CHECK-NEXT: mv s0, a0
; CHECK-NEXT: lbu s2, 0(a0)
; CHECK-NEXT: mv s1, a1
; CHECK-NEXT: call g
; CHECK-NEXT: lbu s0, 1(s0)
; CHECK-NEXT: call g
; CHECK-NEXT: sb s2, 0(s1)
; CHECK-NEXT: sb s0, 1(s1)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: .cfi_restore s2
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 1
%x0 = load i8, ptr %p0, align 2
call void @g() readnone
%x1 = load i8, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 1
store i8 %x0, ptr %q0, align 2
store i8 %x1, ptr %q1
ret void
}
; We could reorder the second call and the store here to
; enable merging, but don't currently do so.
define void @i8_i16_resched_readnone_st(ptr %p, ptr %q) {
; CHECK-LABEL: i8_i16_resched_readnone_st:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: .cfi_offset s2, -32
; CHECK-NEXT: lbu s1, 0(a0)
; CHECK-NEXT: lbu s2, 1(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: sb s1, 0(s0)
; CHECK-NEXT: call g
; CHECK-NEXT: sb s2, 1(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: .cfi_restore s2
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 1
%x0 = load i8, ptr %p0, align 2
%x1 = load i8, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
store i8 %x0, ptr %q0, align 2
call void @g() readnone
%q1 = getelementptr i8, ptr %q, i64 1
store i8 %x1, ptr %q1
ret void
}
define void @i32_i64(ptr %p, ptr %q) {
; CHECK-LABEL: i32_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: ld s1, 0(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: sd s1, 0(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 4
%x0 = load i32, ptr %p0, align 8
%x1 = load i32, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 4
store i32 %x0, ptr %q0, align 8
store i32 %x1, ptr %q1
ret void
}
define void @i32_i64_rotate(ptr %p, ptr %q) {
; CHECK-LABEL: i32_i64_rotate:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: ld a0, 0(a0)
; CHECK-NEXT: rori s1, a0, 32
; CHECK-NEXT: call g
; CHECK-NEXT: sd s1, 0(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 4
%x0 = load i32, ptr %p0, align 8
%x1 = load i32, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 4
store i32 %x1, ptr %q0, align 8
store i32 %x0, ptr %q1
ret void
}
; Merging vectors is profitable: it reduces pressure within a single
; register class.
define void @v2i8_v4i8(ptr %p, ptr %q) {
; CHECK-LABEL: v2i8_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vse8.v v8, (s0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 32
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 2
%x0 = load <2 x i8>, ptr %p0, align 2
%x1 = load <2 x i8>, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 2
store <2 x i8> %x0, ptr %q0, align 2
store <2 x i8> %x1, ptr %q1
ret void
}
; Merging two 16 x i8 values into one 32 x i8 (on zvl128b) requires the same
; number of registers to be spilled, but it can be done with fewer
; instructions.
define void @v16i8_v32i8(ptr %p, ptr %q) {
; CHECK-LABEL: v16i8_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -64
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb
; CHECK-NEXT: li s1, 32
; CHECK-NEXT: vsetvli zero, s1, e8, m2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 32
; CHECK-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: addi a0, sp, 32
; CHECK-NEXT: vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
; CHECK-NEXT: vsetvli zero, s1, e8, m2, ta, ma
; CHECK-NEXT: vse8.v v8, (s0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: sh1add sp, a0, sp
; CHECK-NEXT: .cfi_def_cfa sp, 64
; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: addi sp, sp, 64
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 16
%x0 = load <16 x i8>, ptr %p0, align 2
%x1 = load <16 x i8>, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 16
store <16 x i8> %x0, ptr %q0, align 2
store <16 x i8> %x1, ptr %q1
ret void
}
define void @two_half(ptr %p, ptr %q) {
; CHECK-LABEL: two_half:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: lw s1, 0(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: sw s1, 0(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 2
%x0 = load half, ptr %p0, align 4
%x1 = load half, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 2
store half %x0, ptr %q0, align 4
store half %x1, ptr %q1
ret void
}
define void @two_half_unaligned(ptr %p, ptr %q) {
; V-LABEL: two_half_unaligned:
; V: # %bb.0:
; V-NEXT: addi sp, sp, -32
; V-NEXT: .cfi_def_cfa_offset 32
; V-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; V-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; V-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; V-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; V-NEXT: .cfi_offset ra, -8
; V-NEXT: .cfi_offset s0, -16
; V-NEXT: .cfi_offset s1, -24
; V-NEXT: .cfi_offset s2, -32
; V-NEXT: lh s1, 0(a0)
; V-NEXT: lh s2, 2(a0)
; V-NEXT: mv s0, a1
; V-NEXT: call g
; V-NEXT: sh s1, 0(s0)
; V-NEXT: sh s2, 2(s0)
; V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; V-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; V-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
; V-NEXT: .cfi_restore ra
; V-NEXT: .cfi_restore s0
; V-NEXT: .cfi_restore s1
; V-NEXT: .cfi_restore s2
; V-NEXT: addi sp, sp, 32
; V-NEXT: .cfi_def_cfa_offset 0
; V-NEXT: ret
;
; ZVFH-LABEL: two_half_unaligned:
; ZVFH: # %bb.0:
; ZVFH-NEXT: addi sp, sp, -32
; ZVFH-NEXT: .cfi_def_cfa_offset 32
; ZVFH-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; ZVFH-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; ZVFH-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill
; ZVFH-NEXT: fsd fs1, 0(sp) # 8-byte Folded Spill
; ZVFH-NEXT: .cfi_offset ra, -8
; ZVFH-NEXT: .cfi_offset s0, -16
; ZVFH-NEXT: .cfi_offset fs0, -24
; ZVFH-NEXT: .cfi_offset fs1, -32
; ZVFH-NEXT: flh fs0, 0(a0)
; ZVFH-NEXT: flh fs1, 2(a0)
; ZVFH-NEXT: mv s0, a1
; ZVFH-NEXT: call g
; ZVFH-NEXT: fsh fs0, 0(s0)
; ZVFH-NEXT: fsh fs1, 2(s0)
; ZVFH-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; ZVFH-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; ZVFH-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload
; ZVFH-NEXT: fld fs1, 0(sp) # 8-byte Folded Reload
; ZVFH-NEXT: .cfi_restore ra
; ZVFH-NEXT: .cfi_restore s0
; ZVFH-NEXT: .cfi_restore fs0
; ZVFH-NEXT: .cfi_restore fs1
; ZVFH-NEXT: addi sp, sp, 32
; ZVFH-NEXT: .cfi_def_cfa_offset 0
; ZVFH-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 2
%x0 = load half, ptr %p0
%x1 = load half, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 2
store half %x0, ptr %q0
store half %x1, ptr %q1
ret void
}
define void @two_float(ptr %p, ptr %q) {
; CHECK-LABEL: two_float:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: ld s1, 0(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: sd s1, 0(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 4
%x0 = load float, ptr %p0, align 8
%x1 = load float, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 4
store float %x0, ptr %q0, align 8
store float %x1, ptr %q1
ret void
}
define void @two_float_unaligned(ptr %p, ptr %q) {
; CHECK-LABEL: two_float_unaligned:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: fsd fs1, 0(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset fs0, -24
; CHECK-NEXT: .cfi_offset fs1, -32
; CHECK-NEXT: flw fs0, 0(a0)
; CHECK-NEXT: flw fs1, 4(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: fsw fs0, 0(s0)
; CHECK-NEXT: fsw fs1, 4(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: fld fs1, 0(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore fs0
; CHECK-NEXT: .cfi_restore fs1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 4
%x0 = load float, ptr %p0
%x1 = load float, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 4
store float %x0, ptr %q0
store float %x1, ptr %q1
ret void
}
define void @two_float_rotate(ptr %p, ptr %q) {
; CHECK-LABEL: two_float_rotate:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: ld a0, 0(a0)
; CHECK-NEXT: rori s1, a0, 32
; CHECK-NEXT: call g
; CHECK-NEXT: sd s1, 0(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 4
%x0 = load float, ptr %p0, align 8
%x1 = load float, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 4
store float %x1, ptr %q0, align 8
store float %x0, ptr %q1
ret void
}
define void @two_double(ptr %p, ptr %q) {
; CHECK-LABEL: two_double:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: fsd fs1, 0(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset fs0, -24
; CHECK-NEXT: .cfi_offset fs1, -32
; CHECK-NEXT: fld fs0, 0(a0)
; CHECK-NEXT: fld fs1, 8(a0)
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: call g
; CHECK-NEXT: fsd fs0, 0(s0)
; CHECK-NEXT: fsd fs1, 8(s0)
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: fld fs1, 0(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore fs0
; CHECK-NEXT: .cfi_restore fs1
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%p0 = getelementptr i8, ptr %p, i64 0
%p1 = getelementptr i8, ptr %p, i64 8
%x0 = load double, ptr %p0
%x1 = load double, ptr %p1
call void @g()
%q0 = getelementptr i8, ptr %q, i64 0
%q1 = getelementptr i8, ptr %q, i64 8
store double %x0, ptr %q0
store double %x1, ptr %q1
ret void
}