[RISCV] Prevent P extension from creating unaligned scalar load/store instructions. (#174878)
The P extension requires us to use base ISA load/store instructions for
small vectors. We need to make sure we don't generate misaligned
instructions.
We'll need to do more work here if we want P and V to be enabled at the
same time, but that's a future problem.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index add1e71..17a4bc0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -25365,7 +25365,7 @@
bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
unsigned *Fast) const {
- if (!VT.isVector()) {
+ if (!VT.isVector() || Subtarget.enablePExtSIMDCodeGen()) {
if (Fast)
*Fast = Subtarget.enableUnalignedScalarMem();
return Subtarget.enableUnalignedScalarMem();
diff --git a/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll
new file mode 100644
index 0000000..32dd7bc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll
@@ -0,0 +1,1210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p \
+; RUN: -riscv-enable-p-ext-simd-codegen < %s \
+; RUN: | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p \
+; RUN: -riscv-enable-p-ext-simd-codegen < %s \
+; RUN: | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p,+unaligned-scalar-mem \
+; RUN: -riscv-enable-p-ext-simd-codegen < %s \
+; RUN: | FileCheck --check-prefixes=UNALIGNED,UNALIGNED-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p,+unaligned-scalar-mem \
+; RUN: -riscv-enable-p-ext-simd-codegen < %s \
+; RUN: | FileCheck --check-prefixes=UNALIGNED,UNALIGNED-RV64 %s
+
+define void @test_load_v4i8_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i8_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lbu a2, 5(a1)
+; CHECK-RV32-NEXT: lbu a3, 6(a1)
+; CHECK-RV32-NEXT: lbu a4, 7(a1)
+; CHECK-RV32-NEXT: lbu a5, 4(a1)
+; CHECK-RV32-NEXT: slli a2, a2, 8
+; CHECK-RV32-NEXT: slli a3, a3, 16
+; CHECK-RV32-NEXT: slli a4, a4, 24
+; CHECK-RV32-NEXT: or a2, a2, a5
+; CHECK-RV32-NEXT: or a3, a4, a3
+; CHECK-RV32-NEXT: lbu a4, 1(a1)
+; CHECK-RV32-NEXT: lbu a5, 0(a1)
+; CHECK-RV32-NEXT: lbu a6, 2(a1)
+; CHECK-RV32-NEXT: lbu a1, 3(a1)
+; CHECK-RV32-NEXT: slli a4, a4, 8
+; CHECK-RV32-NEXT: or a4, a4, a5
+; CHECK-RV32-NEXT: slli a6, a6, 16
+; CHECK-RV32-NEXT: slli a1, a1, 24
+; CHECK-RV32-NEXT: or a1, a1, a6
+; CHECK-RV32-NEXT: or a2, a3, a2
+; CHECK-RV32-NEXT: or a1, a1, a4
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v4i8_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lbu a2, 5(a1)
+; CHECK-RV64-NEXT: lbu a3, 6(a1)
+; CHECK-RV64-NEXT: lbu a4, 7(a1)
+; CHECK-RV64-NEXT: lbu a5, 4(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 8
+; CHECK-RV64-NEXT: slli a3, a3, 16
+; CHECK-RV64-NEXT: slli a4, a4, 24
+; CHECK-RV64-NEXT: or a2, a2, a5
+; CHECK-RV64-NEXT: or a3, a4, a3
+; CHECK-RV64-NEXT: lbu a4, 1(a1)
+; CHECK-RV64-NEXT: lbu a5, 0(a1)
+; CHECK-RV64-NEXT: lbu a6, 2(a1)
+; CHECK-RV64-NEXT: lbu a1, 3(a1)
+; CHECK-RV64-NEXT: slli a4, a4, 8
+; CHECK-RV64-NEXT: or a4, a4, a5
+; CHECK-RV64-NEXT: slli a6, a6, 16
+; CHECK-RV64-NEXT: slli a1, a1, 24
+; CHECK-RV64-NEXT: or a1, a1, a6
+; CHECK-RV64-NEXT: or a2, a3, a2
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: pack a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i8_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i8_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr, align 1 ; NOTE(review): name says v4i8 but this is <8 x i8>, duplicating test_load_v8i8_align1 - confirm <4 x i8> was intended and regenerate the checks
+ store <8 x i8> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v4i8_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i8_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 24
+; CHECK-RV32-NEXT: srli a4, a2, 16
+; CHECK-RV32-NEXT: srli a5, a2, 8
+; CHECK-RV32-NEXT: srli a6, a0, 24
+; CHECK-RV32-NEXT: srli a7, a0, 16
+; CHECK-RV32-NEXT: sb a2, 4(a1)
+; CHECK-RV32-NEXT: sb a5, 5(a1)
+; CHECK-RV32-NEXT: sb a4, 6(a1)
+; CHECK-RV32-NEXT: sb a3, 7(a1)
+; CHECK-RV32-NEXT: srli a2, a0, 8
+; CHECK-RV32-NEXT: sb a0, 0(a1)
+; CHECK-RV32-NEXT: sb a2, 1(a1)
+; CHECK-RV32-NEXT: sb a7, 2(a1)
+; CHECK-RV32-NEXT: sb a6, 3(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v4i8_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 56
+; CHECK-RV64-NEXT: srli a3, a0, 48
+; CHECK-RV64-NEXT: srli a4, a0, 40
+; CHECK-RV64-NEXT: srli a5, a0, 32
+; CHECK-RV64-NEXT: srli a6, a0, 24
+; CHECK-RV64-NEXT: srli a7, a0, 16
+; CHECK-RV64-NEXT: sb a5, 4(a1)
+; CHECK-RV64-NEXT: sb a4, 5(a1)
+; CHECK-RV64-NEXT: sb a3, 6(a1)
+; CHECK-RV64-NEXT: sb a2, 7(a1)
+; CHECK-RV64-NEXT: srli a2, a0, 8
+; CHECK-RV64-NEXT: sb a0, 0(a1)
+; CHECK-RV64-NEXT: sb a2, 1(a1)
+; CHECK-RV64-NEXT: sb a7, 2(a1)
+; CHECK-RV64-NEXT: sb a6, 3(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i8_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i8_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr
+ store <8 x i8> %a, ptr %b_ptr, align 1 ; NOTE(review): name says v4i8 but this is <8 x i8>, duplicating test_store_v8i8_align1 - confirm <4 x i8> was intended and regenerate the checks
+ ret void
+}
+
+define void @test_load_v4i8_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i8_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lhu a2, 2(a1)
+; CHECK-RV32-NEXT: lhu a3, 4(a1)
+; CHECK-RV32-NEXT: lhu a4, 6(a1)
+; CHECK-RV32-NEXT: lhu a1, 0(a1)
+; CHECK-RV32-NEXT: pack a3, a3, a4
+; CHECK-RV32-NEXT: pack a1, a1, a2
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: sw a3, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v4i8_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lhu a2, 2(a1)
+; CHECK-RV64-NEXT: lhu a3, 0(a1)
+; CHECK-RV64-NEXT: lhu a4, 4(a1)
+; CHECK-RV64-NEXT: lhu a1, 6(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 16
+; CHECK-RV64-NEXT: or a2, a2, a3
+; CHECK-RV64-NEXT: slli a4, a4, 32
+; CHECK-RV64-NEXT: slli a1, a1, 48
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: or a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i8_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i8_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr, align 2 ; NOTE(review): name says v4i8 but this is <8 x i8>, duplicating test_load_v8i8_align2 - confirm <4 x i8> was intended and regenerate the checks
+ store <8 x i8> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v4i8_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i8_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 16
+; CHECK-RV32-NEXT: srli a4, a0, 16
+; CHECK-RV32-NEXT: sh a0, 0(a1)
+; CHECK-RV32-NEXT: sh a4, 2(a1)
+; CHECK-RV32-NEXT: sh a2, 4(a1)
+; CHECK-RV32-NEXT: sh a3, 6(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v4i8_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 48
+; CHECK-RV64-NEXT: srli a3, a0, 32
+; CHECK-RV64-NEXT: srli a4, a0, 16
+; CHECK-RV64-NEXT: sh a0, 0(a1)
+; CHECK-RV64-NEXT: sh a4, 2(a1)
+; CHECK-RV64-NEXT: sh a3, 4(a1)
+; CHECK-RV64-NEXT: sh a2, 6(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i8_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i8_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr
+ store <8 x i8> %a, ptr %b_ptr, align 2 ; NOTE(review): name says v4i8 but this is <8 x i8>, duplicating test_store_v8i8_align2 - confirm <4 x i8> was intended and regenerate the checks
+ ret void
+}
+
+define void @test_load_v4i8_align4(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i8_align4:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 0(a1)
+; CHECK-RV32-NEXT: lw a1, 4(a1)
+; CHECK-RV32-NEXT: sw a2, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v4i8_align4:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 0(a1)
+; CHECK-RV64-NEXT: lw a1, 4(a1)
+; CHECK-RV64-NEXT: pack a1, a2, a1
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i8_align4:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i8_align4:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr, align 4 ; NOTE(review): name says v4i8 but this is <8 x i8> - confirm whether <4 x i8> or a v8i8 name was intended
+ store <8 x i8> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v4i8_align4(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i8_align4:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: lw a0, 4(a0)
+; CHECK-RV32-NEXT: sw a2, 0(a1)
+; CHECK-RV32-NEXT: sw a0, 4(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v4i8_align4:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 32
+; CHECK-RV64-NEXT: sw a0, 0(a1)
+; CHECK-RV64-NEXT: sw a2, 4(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i8_align4:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i8_align4:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr
+ store <8 x i8> %a, ptr %b_ptr, align 4 ; NOTE(review): name says v4i8 but this is <8 x i8> - confirm whether <4 x i8> or a v8i8 name was intended
+ ret void
+}
+
+define void @test_load_v2i16_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i16_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lbu a2, 1(a1)
+; CHECK-RV32-NEXT: lbu a3, 0(a1)
+; CHECK-RV32-NEXT: lbu a4, 2(a1)
+; CHECK-RV32-NEXT: lbu a1, 3(a1)
+; CHECK-RV32-NEXT: slli a2, a2, 8
+; CHECK-RV32-NEXT: or a2, a2, a3
+; CHECK-RV32-NEXT: slli a4, a4, 16
+; CHECK-RV32-NEXT: slli a1, a1, 24
+; CHECK-RV32-NEXT: or a1, a1, a4
+; CHECK-RV32-NEXT: or a1, a1, a2
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v2i16_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lbu a2, 1(a1)
+; CHECK-RV64-NEXT: lbu a3, 0(a1)
+; CHECK-RV64-NEXT: lbu a4, 2(a1)
+; CHECK-RV64-NEXT: lb a1, 3(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 8
+; CHECK-RV64-NEXT: or a2, a2, a3
+; CHECK-RV64-NEXT: slli a4, a4, 16
+; CHECK-RV64-NEXT: slli a1, a1, 24
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: or a1, a1, a2
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-LABEL: test_load_v2i16_align1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: lw a1, 0(a1)
+; UNALIGNED-NEXT: sw a1, 0(a0)
+; UNALIGNED-NEXT: ret
+ %a = load <2 x i16>, ptr %a_ptr, align 1
+ store <2 x i16> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v2i16_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_store_v2i16_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a0, 0(a0)
+; CHECK-NEXT: srli a2, a0, 24
+; CHECK-NEXT: srli a3, a0, 16
+; CHECK-NEXT: srli a4, a0, 8
+; CHECK-NEXT: sb a0, 0(a1)
+; CHECK-NEXT: sb a4, 1(a1)
+; CHECK-NEXT: sb a3, 2(a1)
+; CHECK-NEXT: sb a2, 3(a1)
+; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: test_store_v2i16_align1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: lw a0, 0(a0)
+; UNALIGNED-NEXT: sw a0, 0(a1)
+; UNALIGNED-NEXT: ret
+ %a = load <2 x i16>, ptr %a_ptr
+ store <2 x i16> %a, ptr %b_ptr, align 1
+ ret void
+}
+
+define void @test_load_v2i16_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i16_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lhu a2, 0(a1)
+; CHECK-RV32-NEXT: lhu a1, 2(a1)
+; CHECK-RV32-NEXT: pack a1, a2, a1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v2i16_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lh a2, 2(a1)
+; CHECK-RV64-NEXT: lhu a1, 0(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 16
+; CHECK-RV64-NEXT: or a1, a2, a1
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-LABEL: test_load_v2i16_align2:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: lw a1, 0(a1)
+; UNALIGNED-NEXT: sw a1, 0(a0)
+; UNALIGNED-NEXT: ret
+ %a = load <2 x i16>, ptr %a_ptr, align 2
+ store <2 x i16> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v2i16_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_store_v2i16_align2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a0, 0(a0)
+; CHECK-NEXT: srli a2, a0, 16
+; CHECK-NEXT: sh a0, 0(a1)
+; CHECK-NEXT: sh a2, 2(a1)
+; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: test_store_v2i16_align2:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: lw a0, 0(a0)
+; UNALIGNED-NEXT: sw a0, 0(a1)
+; UNALIGNED-NEXT: ret
+ %a = load <2 x i16>, ptr %a_ptr
+ store <2 x i16> %a, ptr %b_ptr, align 2
+ ret void
+}
+
+define void @test_load_v8i8_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v8i8_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lbu a2, 5(a1)
+; CHECK-RV32-NEXT: lbu a3, 6(a1)
+; CHECK-RV32-NEXT: lbu a4, 7(a1)
+; CHECK-RV32-NEXT: lbu a5, 4(a1)
+; CHECK-RV32-NEXT: slli a2, a2, 8
+; CHECK-RV32-NEXT: slli a3, a3, 16
+; CHECK-RV32-NEXT: slli a4, a4, 24
+; CHECK-RV32-NEXT: or a2, a2, a5
+; CHECK-RV32-NEXT: or a3, a4, a3
+; CHECK-RV32-NEXT: lbu a4, 1(a1)
+; CHECK-RV32-NEXT: lbu a5, 0(a1)
+; CHECK-RV32-NEXT: lbu a6, 2(a1)
+; CHECK-RV32-NEXT: lbu a1, 3(a1)
+; CHECK-RV32-NEXT: slli a4, a4, 8
+; CHECK-RV32-NEXT: or a4, a4, a5
+; CHECK-RV32-NEXT: slli a6, a6, 16
+; CHECK-RV32-NEXT: slli a1, a1, 24
+; CHECK-RV32-NEXT: or a1, a1, a6
+; CHECK-RV32-NEXT: or a2, a3, a2
+; CHECK-RV32-NEXT: or a1, a1, a4
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v8i8_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lbu a2, 5(a1)
+; CHECK-RV64-NEXT: lbu a3, 6(a1)
+; CHECK-RV64-NEXT: lbu a4, 7(a1)
+; CHECK-RV64-NEXT: lbu a5, 4(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 8
+; CHECK-RV64-NEXT: slli a3, a3, 16
+; CHECK-RV64-NEXT: slli a4, a4, 24
+; CHECK-RV64-NEXT: or a2, a2, a5
+; CHECK-RV64-NEXT: or a3, a4, a3
+; CHECK-RV64-NEXT: lbu a4, 1(a1)
+; CHECK-RV64-NEXT: lbu a5, 0(a1)
+; CHECK-RV64-NEXT: lbu a6, 2(a1)
+; CHECK-RV64-NEXT: lbu a1, 3(a1)
+; CHECK-RV64-NEXT: slli a4, a4, 8
+; CHECK-RV64-NEXT: or a4, a4, a5
+; CHECK-RV64-NEXT: slli a6, a6, 16
+; CHECK-RV64-NEXT: slli a1, a1, 24
+; CHECK-RV64-NEXT: or a1, a1, a6
+; CHECK-RV64-NEXT: or a2, a3, a2
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: pack a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v8i8_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v8i8_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr, align 1
+ store <8 x i8> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v8i8_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v8i8_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 24
+; CHECK-RV32-NEXT: srli a4, a2, 16
+; CHECK-RV32-NEXT: srli a5, a2, 8
+; CHECK-RV32-NEXT: srli a6, a0, 24
+; CHECK-RV32-NEXT: srli a7, a0, 16
+; CHECK-RV32-NEXT: sb a2, 4(a1)
+; CHECK-RV32-NEXT: sb a5, 5(a1)
+; CHECK-RV32-NEXT: sb a4, 6(a1)
+; CHECK-RV32-NEXT: sb a3, 7(a1)
+; CHECK-RV32-NEXT: srli a2, a0, 8
+; CHECK-RV32-NEXT: sb a0, 0(a1)
+; CHECK-RV32-NEXT: sb a2, 1(a1)
+; CHECK-RV32-NEXT: sb a7, 2(a1)
+; CHECK-RV32-NEXT: sb a6, 3(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v8i8_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 56
+; CHECK-RV64-NEXT: srli a3, a0, 48
+; CHECK-RV64-NEXT: srli a4, a0, 40
+; CHECK-RV64-NEXT: srli a5, a0, 32
+; CHECK-RV64-NEXT: srli a6, a0, 24
+; CHECK-RV64-NEXT: srli a7, a0, 16
+; CHECK-RV64-NEXT: sb a5, 4(a1)
+; CHECK-RV64-NEXT: sb a4, 5(a1)
+; CHECK-RV64-NEXT: sb a3, 6(a1)
+; CHECK-RV64-NEXT: sb a2, 7(a1)
+; CHECK-RV64-NEXT: srli a2, a0, 8
+; CHECK-RV64-NEXT: sb a0, 0(a1)
+; CHECK-RV64-NEXT: sb a2, 1(a1)
+; CHECK-RV64-NEXT: sb a7, 2(a1)
+; CHECK-RV64-NEXT: sb a6, 3(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v8i8_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v8i8_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr
+ store <8 x i8> %a, ptr %b_ptr, align 1
+ ret void
+}
+
+define void @test_load_v8i8_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v8i8_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lhu a2, 2(a1)
+; CHECK-RV32-NEXT: lhu a3, 4(a1)
+; CHECK-RV32-NEXT: lhu a4, 6(a1)
+; CHECK-RV32-NEXT: lhu a1, 0(a1)
+; CHECK-RV32-NEXT: pack a3, a3, a4
+; CHECK-RV32-NEXT: pack a1, a1, a2
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: sw a3, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v8i8_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lhu a2, 2(a1)
+; CHECK-RV64-NEXT: lhu a3, 0(a1)
+; CHECK-RV64-NEXT: lhu a4, 4(a1)
+; CHECK-RV64-NEXT: lhu a1, 6(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 16
+; CHECK-RV64-NEXT: or a2, a2, a3
+; CHECK-RV64-NEXT: slli a4, a4, 32
+; CHECK-RV64-NEXT: slli a1, a1, 48
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: or a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v8i8_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v8i8_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr, align 2
+ store <8 x i8> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v8i8_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v8i8_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 16
+; CHECK-RV32-NEXT: srli a4, a0, 16
+; CHECK-RV32-NEXT: sh a0, 0(a1)
+; CHECK-RV32-NEXT: sh a4, 2(a1)
+; CHECK-RV32-NEXT: sh a2, 4(a1)
+; CHECK-RV32-NEXT: sh a3, 6(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v8i8_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 48
+; CHECK-RV64-NEXT: srli a3, a0, 32
+; CHECK-RV64-NEXT: srli a4, a0, 16
+; CHECK-RV64-NEXT: sh a0, 0(a1)
+; CHECK-RV64-NEXT: sh a4, 2(a1)
+; CHECK-RV64-NEXT: sh a3, 4(a1)
+; CHECK-RV64-NEXT: sh a2, 6(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v8i8_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v8i8_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <8 x i8>, ptr %a_ptr
+ store <8 x i8> %a, ptr %b_ptr, align 2
+ ret void
+}
+
+define void @test_load_v4i16_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i16_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lbu a2, 5(a1)
+; CHECK-RV32-NEXT: lbu a3, 6(a1)
+; CHECK-RV32-NEXT: lbu a4, 7(a1)
+; CHECK-RV32-NEXT: lbu a5, 4(a1)
+; CHECK-RV32-NEXT: slli a2, a2, 8
+; CHECK-RV32-NEXT: slli a3, a3, 16
+; CHECK-RV32-NEXT: slli a4, a4, 24
+; CHECK-RV32-NEXT: or a2, a2, a5
+; CHECK-RV32-NEXT: or a3, a4, a3
+; CHECK-RV32-NEXT: lbu a4, 1(a1)
+; CHECK-RV32-NEXT: lbu a5, 0(a1)
+; CHECK-RV32-NEXT: lbu a6, 2(a1)
+; CHECK-RV32-NEXT: lbu a1, 3(a1)
+; CHECK-RV32-NEXT: slli a4, a4, 8
+; CHECK-RV32-NEXT: or a4, a4, a5
+; CHECK-RV32-NEXT: slli a6, a6, 16
+; CHECK-RV32-NEXT: slli a1, a1, 24
+; CHECK-RV32-NEXT: or a1, a1, a6
+; CHECK-RV32-NEXT: or a2, a3, a2
+; CHECK-RV32-NEXT: or a1, a1, a4
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v4i16_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lbu a2, 5(a1)
+; CHECK-RV64-NEXT: lbu a3, 6(a1)
+; CHECK-RV64-NEXT: lbu a4, 7(a1)
+; CHECK-RV64-NEXT: lbu a5, 4(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 8
+; CHECK-RV64-NEXT: slli a3, a3, 16
+; CHECK-RV64-NEXT: slli a4, a4, 24
+; CHECK-RV64-NEXT: or a2, a2, a5
+; CHECK-RV64-NEXT: or a3, a4, a3
+; CHECK-RV64-NEXT: lbu a4, 1(a1)
+; CHECK-RV64-NEXT: lbu a5, 0(a1)
+; CHECK-RV64-NEXT: lbu a6, 2(a1)
+; CHECK-RV64-NEXT: lbu a1, 3(a1)
+; CHECK-RV64-NEXT: slli a4, a4, 8
+; CHECK-RV64-NEXT: or a4, a4, a5
+; CHECK-RV64-NEXT: slli a6, a6, 16
+; CHECK-RV64-NEXT: slli a1, a1, 24
+; CHECK-RV64-NEXT: or a1, a1, a6
+; CHECK-RV64-NEXT: or a2, a3, a2
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: pack a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i16_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i16_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <4 x i16>, ptr %a_ptr, align 1
+ store <4 x i16> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v4i16_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i16_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 24
+; CHECK-RV32-NEXT: srli a4, a2, 16
+; CHECK-RV32-NEXT: srli a5, a2, 8
+; CHECK-RV32-NEXT: srli a6, a0, 24
+; CHECK-RV32-NEXT: srli a7, a0, 16
+; CHECK-RV32-NEXT: sb a2, 4(a1)
+; CHECK-RV32-NEXT: sb a5, 5(a1)
+; CHECK-RV32-NEXT: sb a4, 6(a1)
+; CHECK-RV32-NEXT: sb a3, 7(a1)
+; CHECK-RV32-NEXT: srli a2, a0, 8
+; CHECK-RV32-NEXT: sb a0, 0(a1)
+; CHECK-RV32-NEXT: sb a2, 1(a1)
+; CHECK-RV32-NEXT: sb a7, 2(a1)
+; CHECK-RV32-NEXT: sb a6, 3(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v4i16_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 56
+; CHECK-RV64-NEXT: srli a3, a0, 48
+; CHECK-RV64-NEXT: srli a4, a0, 40
+; CHECK-RV64-NEXT: srli a5, a0, 32
+; CHECK-RV64-NEXT: srli a6, a0, 24
+; CHECK-RV64-NEXT: srli a7, a0, 16
+; CHECK-RV64-NEXT: sb a5, 4(a1)
+; CHECK-RV64-NEXT: sb a4, 5(a1)
+; CHECK-RV64-NEXT: sb a3, 6(a1)
+; CHECK-RV64-NEXT: sb a2, 7(a1)
+; CHECK-RV64-NEXT: srli a2, a0, 8
+; CHECK-RV64-NEXT: sb a0, 0(a1)
+; CHECK-RV64-NEXT: sb a2, 1(a1)
+; CHECK-RV64-NEXT: sb a7, 2(a1)
+; CHECK-RV64-NEXT: sb a6, 3(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i16_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i16_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <4 x i16>, ptr %a_ptr
+ store <4 x i16> %a, ptr %b_ptr, align 1
+ ret void
+}
+
+define void @test_load_v4i16_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i16_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lhu a2, 2(a1)
+; CHECK-RV32-NEXT: lhu a3, 4(a1)
+; CHECK-RV32-NEXT: lhu a4, 6(a1)
+; CHECK-RV32-NEXT: lhu a1, 0(a1)
+; CHECK-RV32-NEXT: pack a3, a3, a4
+; CHECK-RV32-NEXT: pack a1, a1, a2
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: sw a3, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v4i16_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lhu a2, 2(a1)
+; CHECK-RV64-NEXT: lhu a3, 0(a1)
+; CHECK-RV64-NEXT: lhu a4, 4(a1)
+; CHECK-RV64-NEXT: lhu a1, 6(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 16
+; CHECK-RV64-NEXT: or a2, a2, a3
+; CHECK-RV64-NEXT: slli a4, a4, 32
+; CHECK-RV64-NEXT: slli a1, a1, 48
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: or a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i16_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i16_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <4 x i16>, ptr %a_ptr, align 2
+ store <4 x i16> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v4i16_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i16_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 16
+; CHECK-RV32-NEXT: srli a4, a0, 16
+; CHECK-RV32-NEXT: sh a0, 0(a1)
+; CHECK-RV32-NEXT: sh a4, 2(a1)
+; CHECK-RV32-NEXT: sh a2, 4(a1)
+; CHECK-RV32-NEXT: sh a3, 6(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v4i16_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 48
+; CHECK-RV64-NEXT: srli a3, a0, 32
+; CHECK-RV64-NEXT: srli a4, a0, 16
+; CHECK-RV64-NEXT: sh a0, 0(a1)
+; CHECK-RV64-NEXT: sh a4, 2(a1)
+; CHECK-RV64-NEXT: sh a3, 4(a1)
+; CHECK-RV64-NEXT: sh a2, 6(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i16_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i16_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <4 x i16>, ptr %a_ptr
+ store <4 x i16> %a, ptr %b_ptr, align 2
+ ret void
+}
+
+define void @test_load_v4i16_align4(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i16_align4:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 0(a1)
+; CHECK-RV32-NEXT: lw a1, 4(a1)
+; CHECK-RV32-NEXT: sw a2, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v4i16_align4:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 0(a1)
+; CHECK-RV64-NEXT: lw a1, 4(a1)
+; CHECK-RV64-NEXT: pack a1, a2, a1
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i16_align4:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i16_align4:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <4 x i16>, ptr %a_ptr, align 4
+ store <4 x i16> %a, ptr %ret_ptr
+ ret void
+}
+
+define void @test_store_v4i16_align4(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i16_align4:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: lw a0, 4(a0)
+; CHECK-RV32-NEXT: sw a2, 0(a1)
+; CHECK-RV32-NEXT: sw a0, 4(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v4i16_align4:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 32
+; CHECK-RV64-NEXT: sw a0, 0(a1)
+; CHECK-RV64-NEXT: sw a2, 4(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i16_align4:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i16_align4:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <4 x i16>, ptr %a_ptr
+ store <4 x i16> %a, ptr %b_ptr, align 4
+ ret void
+}
+
+; Loads a <2 x i32> from %a_ptr with only 1-byte alignment and stores it to
+; %ret_ptr (the store carries no explicit alignment).
+; Without +unaligned-scalar-mem the expected code reads the value with lbu
+; byte loads and rebuilds it with slli/or (plus a pack on RV64); with
+; +unaligned-scalar-mem it is expected to use plain lw (RV32) or ld (RV64).
+define void @test_load_v2i32_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i32_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lbu a2, 1(a1)
+; CHECK-RV32-NEXT: lbu a3, 2(a1)
+; CHECK-RV32-NEXT: lbu a4, 3(a1)
+; CHECK-RV32-NEXT: lbu a5, 0(a1)
+; CHECK-RV32-NEXT: slli a2, a2, 8
+; CHECK-RV32-NEXT: slli a3, a3, 16
+; CHECK-RV32-NEXT: slli a4, a4, 24
+; CHECK-RV32-NEXT: or a2, a2, a5
+; CHECK-RV32-NEXT: or a3, a4, a3
+; CHECK-RV32-NEXT: lbu a4, 5(a1)
+; CHECK-RV32-NEXT: lbu a5, 4(a1)
+; CHECK-RV32-NEXT: lbu a6, 6(a1)
+; CHECK-RV32-NEXT: lbu a1, 7(a1)
+; CHECK-RV32-NEXT: slli a4, a4, 8
+; CHECK-RV32-NEXT: or a4, a4, a5
+; CHECK-RV32-NEXT: slli a6, a6, 16
+; CHECK-RV32-NEXT: slli a1, a1, 24
+; CHECK-RV32-NEXT: or a1, a1, a6
+; CHECK-RV32-NEXT: or a2, a3, a2
+; CHECK-RV32-NEXT: or a1, a1, a4
+; CHECK-RV32-NEXT: sw a2, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v2i32_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lbu a2, 5(a1)
+; CHECK-RV64-NEXT: lbu a3, 6(a1)
+; CHECK-RV64-NEXT: lbu a4, 7(a1)
+; CHECK-RV64-NEXT: lbu a5, 4(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 8
+; CHECK-RV64-NEXT: slli a3, a3, 16
+; CHECK-RV64-NEXT: slli a4, a4, 24
+; CHECK-RV64-NEXT: or a2, a2, a5
+; CHECK-RV64-NEXT: or a3, a4, a3
+; CHECK-RV64-NEXT: lbu a4, 1(a1)
+; CHECK-RV64-NEXT: lbu a5, 0(a1)
+; CHECK-RV64-NEXT: lbu a6, 2(a1)
+; CHECK-RV64-NEXT: lbu a1, 3(a1)
+; CHECK-RV64-NEXT: slli a4, a4, 8
+; CHECK-RV64-NEXT: or a4, a4, a5
+; CHECK-RV64-NEXT: slli a6, a6, 16
+; CHECK-RV64-NEXT: slli a1, a1, 24
+; CHECK-RV64-NEXT: or a1, a1, a6
+; CHECK-RV64-NEXT: or a2, a3, a2
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: pack a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v2i32_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v2i32_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <2 x i32>, ptr %a_ptr, align 1
+ store <2 x i32> %a, ptr %ret_ptr
+ ret void
+}
+
+; Loads a <2 x i32> from %a_ptr (no explicit alignment on the load) and stores
+; it to %b_ptr with only 1-byte alignment.
+; Without +unaligned-scalar-mem the expected code extracts each byte with srli
+; and writes it with sb; with +unaligned-scalar-mem it is expected to use
+; plain sw (RV32) or sd (RV64).
+define void @test_store_v2i32_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v2i32_align1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 24
+; CHECK-RV32-NEXT: srli a4, a2, 16
+; CHECK-RV32-NEXT: srli a5, a2, 8
+; CHECK-RV32-NEXT: srli a6, a0, 24
+; CHECK-RV32-NEXT: srli a7, a0, 16
+; CHECK-RV32-NEXT: sb a2, 4(a1)
+; CHECK-RV32-NEXT: sb a5, 5(a1)
+; CHECK-RV32-NEXT: sb a4, 6(a1)
+; CHECK-RV32-NEXT: sb a3, 7(a1)
+; CHECK-RV32-NEXT: srli a2, a0, 8
+; CHECK-RV32-NEXT: sb a0, 0(a1)
+; CHECK-RV32-NEXT: sb a2, 1(a1)
+; CHECK-RV32-NEXT: sb a7, 2(a1)
+; CHECK-RV32-NEXT: sb a6, 3(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v2i32_align1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 56
+; CHECK-RV64-NEXT: srli a3, a0, 48
+; CHECK-RV64-NEXT: srli a4, a0, 40
+; CHECK-RV64-NEXT: srli a5, a0, 32
+; CHECK-RV64-NEXT: srli a6, a0, 24
+; CHECK-RV64-NEXT: srli a7, a0, 16
+; CHECK-RV64-NEXT: sb a5, 4(a1)
+; CHECK-RV64-NEXT: sb a4, 5(a1)
+; CHECK-RV64-NEXT: sb a3, 6(a1)
+; CHECK-RV64-NEXT: sb a2, 7(a1)
+; CHECK-RV64-NEXT: srli a2, a0, 8
+; CHECK-RV64-NEXT: sb a0, 0(a1)
+; CHECK-RV64-NEXT: sb a2, 1(a1)
+; CHECK-RV64-NEXT: sb a7, 2(a1)
+; CHECK-RV64-NEXT: sb a6, 3(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v2i32_align1:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v2i32_align1:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <2 x i32>, ptr %a_ptr
+ store <2 x i32> %a, ptr %b_ptr, align 1
+ ret void
+}
+
+; Loads a <2 x i32> from %a_ptr with 2-byte alignment and stores it to
+; %ret_ptr (the store carries no explicit alignment).
+; Without +unaligned-scalar-mem the expected code reads four halfwords with
+; lhu and recombines them (pack on RV32, slli/or on RV64); with
+; +unaligned-scalar-mem it is expected to use plain lw (RV32) or ld (RV64).
+define void @test_load_v2i32_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i32_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lhu a2, 0(a1)
+; CHECK-RV32-NEXT: lhu a3, 2(a1)
+; CHECK-RV32-NEXT: lhu a4, 4(a1)
+; CHECK-RV32-NEXT: lhu a1, 6(a1)
+; CHECK-RV32-NEXT: pack a2, a2, a3
+; CHECK-RV32-NEXT: pack a1, a4, a1
+; CHECK-RV32-NEXT: sw a2, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v2i32_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lhu a2, 2(a1)
+; CHECK-RV64-NEXT: lhu a3, 0(a1)
+; CHECK-RV64-NEXT: lhu a4, 4(a1)
+; CHECK-RV64-NEXT: lhu a1, 6(a1)
+; CHECK-RV64-NEXT: slli a2, a2, 16
+; CHECK-RV64-NEXT: or a2, a2, a3
+; CHECK-RV64-NEXT: slli a4, a4, 32
+; CHECK-RV64-NEXT: slli a1, a1, 48
+; CHECK-RV64-NEXT: or a1, a1, a4
+; CHECK-RV64-NEXT: or a1, a1, a2
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v2i32_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v2i32_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <2 x i32>, ptr %a_ptr, align 2
+ store <2 x i32> %a, ptr %ret_ptr
+ ret void
+}
+
+; Loads a <2 x i32> from %a_ptr (no explicit alignment on the load) and stores
+; it to %b_ptr with 2-byte alignment.
+; Without +unaligned-scalar-mem the expected code splits the value with srli
+; and writes four halfwords with sh; with +unaligned-scalar-mem it is expected
+; to use plain sw (RV32) or sd (RV64).
+define void @test_store_v2i32_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v2i32_align2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: srli a3, a2, 16
+; CHECK-RV32-NEXT: srli a4, a0, 16
+; CHECK-RV32-NEXT: sh a0, 0(a1)
+; CHECK-RV32-NEXT: sh a4, 2(a1)
+; CHECK-RV32-NEXT: sh a2, 4(a1)
+; CHECK-RV32-NEXT: sh a3, 6(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v2i32_align2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 48
+; CHECK-RV64-NEXT: srli a3, a0, 32
+; CHECK-RV64-NEXT: srli a4, a0, 16
+; CHECK-RV64-NEXT: sh a0, 0(a1)
+; CHECK-RV64-NEXT: sh a4, 2(a1)
+; CHECK-RV64-NEXT: sh a3, 4(a1)
+; CHECK-RV64-NEXT: sh a2, 6(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v2i32_align2:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v2i32_align2:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <2 x i32>, ptr %a_ptr
+ store <2 x i32> %a, ptr %b_ptr, align 2
+ ret void
+}
+
+; Loads a <2 x i32> from %a_ptr with 4-byte alignment and stores it to
+; %ret_ptr (the store carries no explicit alignment).
+; On RV32 the expected code is the same lw/sw pairs in both configurations.
+; On RV64 without +unaligned-scalar-mem the 8-byte value is expected to be read
+; as two lw and joined with pack; with +unaligned-scalar-mem a single ld is
+; expected.
+define void @test_load_v2i32_align4(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i32_align4:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 0(a1)
+; CHECK-RV32-NEXT: lw a1, 4(a1)
+; CHECK-RV32-NEXT: sw a2, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 4(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_load_v2i32_align4:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 0(a1)
+; CHECK-RV64-NEXT: lw a1, 4(a1)
+; CHECK-RV64-NEXT: pack a1, a2, a1
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v2i32_align4:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v2i32_align4:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT: sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <2 x i32>, ptr %a_ptr, align 4
+ store <2 x i32> %a, ptr %ret_ptr
+ ret void
+}
+
+; Loads a <2 x i32> from %a_ptr (no explicit alignment on the load) and stores
+; it to %b_ptr with 4-byte alignment.
+; On RV32 the expected code is the same lw/sw pairs in both configurations.
+; On RV64 without +unaligned-scalar-mem the 8-byte value is expected to be
+; split with srli and written as two sw; with +unaligned-scalar-mem a single
+; sd is expected.
+define void @test_store_v2i32_align4(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v2i32_align4:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: lw a0, 4(a0)
+; CHECK-RV32-NEXT: sw a2, 0(a1)
+; CHECK-RV32-NEXT: sw a0, 4(a1)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_store_v2i32_align4:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: srli a2, a0, 32
+; CHECK-RV64-NEXT: sw a0, 0(a1)
+; CHECK-RV64-NEXT: sw a2, 4(a1)
+; CHECK-RV64-NEXT: ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v2i32_align4:
+; UNALIGNED-RV32: # %bb.0:
+; UNALIGNED-RV32-NEXT: lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT: lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT: sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT: sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT: ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v2i32_align4:
+; UNALIGNED-RV64: # %bb.0:
+; UNALIGNED-RV64-NEXT: ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT: sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT: ret
+ %a = load <2 x i32>, ptr %a_ptr
+ store <2 x i32> %a, ptr %b_ptr, align 4
+ ret void
+}