[RISCV] Prevent P extension from creating unaligned scalar load/store instructions. (#174878)

The P extension requires us to use base ISA load/store instructions for
small vectors. We need to make sure we don't generate misaligned
instructions.

We'll need to do more work here if we want P and V to be enabled at the
same time, but that's a future problem.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index add1e71..17a4bc0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -25365,7 +25365,7 @@
 bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
     unsigned *Fast) const {
-  if (!VT.isVector()) {
+  if (!VT.isVector() || Subtarget.enablePExtSIMDCodeGen()) {
     if (Fast)
       *Fast = Subtarget.enableUnalignedScalarMem();
     return Subtarget.enableUnalignedScalarMem();
diff --git a/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll
new file mode 100644
index 0000000..32dd7bc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll
@@ -0,0 +1,1210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p \
+; RUN:   -riscv-enable-p-ext-simd-codegen < %s \
+; RUN:   | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p \
+; RUN:   -riscv-enable-p-ext-simd-codegen < %s \
+; RUN:   | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p,+unaligned-scalar-mem \
+; RUN:   -riscv-enable-p-ext-simd-codegen < %s \
+; RUN:   | FileCheck --check-prefixes=UNALIGNED,UNALIGNED-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p,+unaligned-scalar-mem \
+; RUN:   -riscv-enable-p-ext-simd-codegen < %s \
+; RUN:   | FileCheck --check-prefixes=UNALIGNED,UNALIGNED-RV64 %s
+
+define void @test_load_v4i8_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i8_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lbu a2, 5(a1)
+; CHECK-RV32-NEXT:    lbu a3, 6(a1)
+; CHECK-RV32-NEXT:    lbu a4, 7(a1)
+; CHECK-RV32-NEXT:    lbu a5, 4(a1)
+; CHECK-RV32-NEXT:    slli a2, a2, 8
+; CHECK-RV32-NEXT:    slli a3, a3, 16
+; CHECK-RV32-NEXT:    slli a4, a4, 24
+; CHECK-RV32-NEXT:    or a2, a2, a5
+; CHECK-RV32-NEXT:    or a3, a4, a3
+; CHECK-RV32-NEXT:    lbu a4, 1(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 2(a1)
+; CHECK-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-RV32-NEXT:    slli a4, a4, 8
+; CHECK-RV32-NEXT:    or a4, a4, a5
+; CHECK-RV32-NEXT:    slli a6, a6, 16
+; CHECK-RV32-NEXT:    slli a1, a1, 24
+; CHECK-RV32-NEXT:    or a1, a1, a6
+; CHECK-RV32-NEXT:    or a2, a3, a2
+; CHECK-RV32-NEXT:    or a1, a1, a4
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v4i8_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lbu a2, 5(a1)
+; CHECK-RV64-NEXT:    lbu a3, 6(a1)
+; CHECK-RV64-NEXT:    lbu a4, 7(a1)
+; CHECK-RV64-NEXT:    lbu a5, 4(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 8
+; CHECK-RV64-NEXT:    slli a3, a3, 16
+; CHECK-RV64-NEXT:    slli a4, a4, 24
+; CHECK-RV64-NEXT:    or a2, a2, a5
+; CHECK-RV64-NEXT:    or a3, a4, a3
+; CHECK-RV64-NEXT:    lbu a4, 1(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 2(a1)
+; CHECK-RV64-NEXT:    lbu a1, 3(a1)
+; CHECK-RV64-NEXT:    slli a4, a4, 8
+; CHECK-RV64-NEXT:    or a4, a4, a5
+; CHECK-RV64-NEXT:    slli a6, a6, 16
+; CHECK-RV64-NEXT:    slli a1, a1, 24
+; CHECK-RV64-NEXT:    or a1, a1, a6
+; CHECK-RV64-NEXT:    or a2, a3, a2
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    pack a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i8_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i8_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr, align 1 ; NOTE(review): name says v4i8 but this loads <8 x i8>, repeating test_load_v8i8_align1; <4 x i8> was probably intended - confirm
+  store <8 x i8> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v4i8_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i8_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 24
+; CHECK-RV32-NEXT:    srli a4, a2, 16
+; CHECK-RV32-NEXT:    srli a5, a2, 8
+; CHECK-RV32-NEXT:    srli a6, a0, 24
+; CHECK-RV32-NEXT:    srli a7, a0, 16
+; CHECK-RV32-NEXT:    sb a2, 4(a1)
+; CHECK-RV32-NEXT:    sb a5, 5(a1)
+; CHECK-RV32-NEXT:    sb a4, 6(a1)
+; CHECK-RV32-NEXT:    sb a3, 7(a1)
+; CHECK-RV32-NEXT:    srli a2, a0, 8
+; CHECK-RV32-NEXT:    sb a0, 0(a1)
+; CHECK-RV32-NEXT:    sb a2, 1(a1)
+; CHECK-RV32-NEXT:    sb a7, 2(a1)
+; CHECK-RV32-NEXT:    sb a6, 3(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v4i8_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 56
+; CHECK-RV64-NEXT:    srli a3, a0, 48
+; CHECK-RV64-NEXT:    srli a4, a0, 40
+; CHECK-RV64-NEXT:    srli a5, a0, 32
+; CHECK-RV64-NEXT:    srli a6, a0, 24
+; CHECK-RV64-NEXT:    srli a7, a0, 16
+; CHECK-RV64-NEXT:    sb a5, 4(a1)
+; CHECK-RV64-NEXT:    sb a4, 5(a1)
+; CHECK-RV64-NEXT:    sb a3, 6(a1)
+; CHECK-RV64-NEXT:    sb a2, 7(a1)
+; CHECK-RV64-NEXT:    srli a2, a0, 8
+; CHECK-RV64-NEXT:    sb a0, 0(a1)
+; CHECK-RV64-NEXT:    sb a2, 1(a1)
+; CHECK-RV64-NEXT:    sb a7, 2(a1)
+; CHECK-RV64-NEXT:    sb a6, 3(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i8_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i8_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  store <8 x i8> %a, ptr %b_ptr, align 1 ; NOTE(review): name says v4i8 but this stores <8 x i8>, repeating test_store_v8i8_align1; <4 x i8> was probably intended - confirm
+  ret void
+}
+
+define void @test_load_v4i8_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i8_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lhu a2, 2(a1)
+; CHECK-RV32-NEXT:    lhu a3, 4(a1)
+; CHECK-RV32-NEXT:    lhu a4, 6(a1)
+; CHECK-RV32-NEXT:    lhu a1, 0(a1)
+; CHECK-RV32-NEXT:    pack a3, a3, a4
+; CHECK-RV32-NEXT:    pack a1, a1, a2
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    sw a3, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v4i8_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lhu a2, 2(a1)
+; CHECK-RV64-NEXT:    lhu a3, 0(a1)
+; CHECK-RV64-NEXT:    lhu a4, 4(a1)
+; CHECK-RV64-NEXT:    lhu a1, 6(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 16
+; CHECK-RV64-NEXT:    or a2, a2, a3
+; CHECK-RV64-NEXT:    slli a4, a4, 32
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    or a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i8_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i8_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr, align 2 ; NOTE(review): name says v4i8 but this loads <8 x i8>, repeating test_load_v8i8_align2; <4 x i8> was probably intended - confirm
+  store <8 x i8> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v4i8_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i8_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 16
+; CHECK-RV32-NEXT:    srli a4, a0, 16
+; CHECK-RV32-NEXT:    sh a0, 0(a1)
+; CHECK-RV32-NEXT:    sh a4, 2(a1)
+; CHECK-RV32-NEXT:    sh a2, 4(a1)
+; CHECK-RV32-NEXT:    sh a3, 6(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v4i8_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 48
+; CHECK-RV64-NEXT:    srli a3, a0, 32
+; CHECK-RV64-NEXT:    srli a4, a0, 16
+; CHECK-RV64-NEXT:    sh a0, 0(a1)
+; CHECK-RV64-NEXT:    sh a4, 2(a1)
+; CHECK-RV64-NEXT:    sh a3, 4(a1)
+; CHECK-RV64-NEXT:    sh a2, 6(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i8_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i8_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  store <8 x i8> %a, ptr %b_ptr, align 2 ; NOTE(review): name says v4i8 but this stores <8 x i8>, repeating test_store_v8i8_align2; <4 x i8> was probably intended - confirm
+  ret void
+}
+
+define void @test_load_v4i8_align4(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i8_align4:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a1)
+; CHECK-RV32-NEXT:    lw a1, 4(a1)
+; CHECK-RV32-NEXT:    sw a2, 0(a0)
+; CHECK-RV32-NEXT:    sw a1, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v4i8_align4:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a1)
+; CHECK-RV64-NEXT:    lw a1, 4(a1)
+; CHECK-RV64-NEXT:    pack a1, a2, a1
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i8_align4:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i8_align4:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr, align 4 ; NOTE(review): name says v4i8 but this loads <8 x i8>; rename to v8i8 or switch to <4 x i8> - confirm intent
+  store <8 x i8> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v4i8_align4(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i8_align4:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    lw a0, 4(a0)
+; CHECK-RV32-NEXT:    sw a2, 0(a1)
+; CHECK-RV32-NEXT:    sw a0, 4(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v4i8_align4:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 32
+; CHECK-RV64-NEXT:    sw a0, 0(a1)
+; CHECK-RV64-NEXT:    sw a2, 4(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i8_align4:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i8_align4:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  store <8 x i8> %a, ptr %b_ptr, align 4 ; NOTE(review): name says v4i8 but this stores <8 x i8>; rename to v8i8 or switch to <4 x i8> - confirm intent
+  ret void
+}
+
+define void @test_load_v2i16_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i16_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lbu a2, 1(a1)
+; CHECK-RV32-NEXT:    lbu a3, 0(a1)
+; CHECK-RV32-NEXT:    lbu a4, 2(a1)
+; CHECK-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-RV32-NEXT:    slli a2, a2, 8
+; CHECK-RV32-NEXT:    or a2, a2, a3
+; CHECK-RV32-NEXT:    slli a4, a4, 16
+; CHECK-RV32-NEXT:    slli a1, a1, 24
+; CHECK-RV32-NEXT:    or a1, a1, a4
+; CHECK-RV32-NEXT:    or a1, a1, a2
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v2i16_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lbu a2, 1(a1)
+; CHECK-RV64-NEXT:    lbu a3, 0(a1)
+; CHECK-RV64-NEXT:    lbu a4, 2(a1)
+; CHECK-RV64-NEXT:    lb a1, 3(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 8
+; CHECK-RV64-NEXT:    or a2, a2, a3
+; CHECK-RV64-NEXT:    slli a4, a4, 16
+; CHECK-RV64-NEXT:    slli a1, a1, 24
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    or a1, a1, a2
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-LABEL: test_load_v2i16_align1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    lw a1, 0(a1)
+; UNALIGNED-NEXT:    sw a1, 0(a0)
+; UNALIGNED-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr, align 1
+  store <2 x i16> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v2i16_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_store_v2i16_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    srli a2, a0, 24
+; CHECK-NEXT:    srli a3, a0, 16
+; CHECK-NEXT:    srli a4, a0, 8
+; CHECK-NEXT:    sb a0, 0(a1)
+; CHECK-NEXT:    sb a4, 1(a1)
+; CHECK-NEXT:    sb a3, 2(a1)
+; CHECK-NEXT:    sb a2, 3(a1)
+; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: test_store_v2i16_align1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    lw a0, 0(a0)
+; UNALIGNED-NEXT:    sw a0, 0(a1)
+; UNALIGNED-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  store <2 x i16> %a, ptr %b_ptr, align 1
+  ret void
+}
+
+define void @test_load_v2i16_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i16_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lhu a2, 0(a1)
+; CHECK-RV32-NEXT:    lhu a1, 2(a1)
+; CHECK-RV32-NEXT:    pack a1, a2, a1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v2i16_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lh a2, 2(a1)
+; CHECK-RV64-NEXT:    lhu a1, 0(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 16
+; CHECK-RV64-NEXT:    or a1, a2, a1
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-LABEL: test_load_v2i16_align2:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    lw a1, 0(a1)
+; UNALIGNED-NEXT:    sw a1, 0(a0)
+; UNALIGNED-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr, align 2
+  store <2 x i16> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v2i16_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_store_v2i16_align2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    srli a2, a0, 16
+; CHECK-NEXT:    sh a0, 0(a1)
+; CHECK-NEXT:    sh a2, 2(a1)
+; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: test_store_v2i16_align2:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    lw a0, 0(a0)
+; UNALIGNED-NEXT:    sw a0, 0(a1)
+; UNALIGNED-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  store <2 x i16> %a, ptr %b_ptr, align 2
+  ret void
+}
+
+define void @test_load_v8i8_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v8i8_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lbu a2, 5(a1)
+; CHECK-RV32-NEXT:    lbu a3, 6(a1)
+; CHECK-RV32-NEXT:    lbu a4, 7(a1)
+; CHECK-RV32-NEXT:    lbu a5, 4(a1)
+; CHECK-RV32-NEXT:    slli a2, a2, 8
+; CHECK-RV32-NEXT:    slli a3, a3, 16
+; CHECK-RV32-NEXT:    slli a4, a4, 24
+; CHECK-RV32-NEXT:    or a2, a2, a5
+; CHECK-RV32-NEXT:    or a3, a4, a3
+; CHECK-RV32-NEXT:    lbu a4, 1(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 2(a1)
+; CHECK-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-RV32-NEXT:    slli a4, a4, 8
+; CHECK-RV32-NEXT:    or a4, a4, a5
+; CHECK-RV32-NEXT:    slli a6, a6, 16
+; CHECK-RV32-NEXT:    slli a1, a1, 24
+; CHECK-RV32-NEXT:    or a1, a1, a6
+; CHECK-RV32-NEXT:    or a2, a3, a2
+; CHECK-RV32-NEXT:    or a1, a1, a4
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v8i8_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lbu a2, 5(a1)
+; CHECK-RV64-NEXT:    lbu a3, 6(a1)
+; CHECK-RV64-NEXT:    lbu a4, 7(a1)
+; CHECK-RV64-NEXT:    lbu a5, 4(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 8
+; CHECK-RV64-NEXT:    slli a3, a3, 16
+; CHECK-RV64-NEXT:    slli a4, a4, 24
+; CHECK-RV64-NEXT:    or a2, a2, a5
+; CHECK-RV64-NEXT:    or a3, a4, a3
+; CHECK-RV64-NEXT:    lbu a4, 1(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 2(a1)
+; CHECK-RV64-NEXT:    lbu a1, 3(a1)
+; CHECK-RV64-NEXT:    slli a4, a4, 8
+; CHECK-RV64-NEXT:    or a4, a4, a5
+; CHECK-RV64-NEXT:    slli a6, a6, 16
+; CHECK-RV64-NEXT:    slli a1, a1, 24
+; CHECK-RV64-NEXT:    or a1, a1, a6
+; CHECK-RV64-NEXT:    or a2, a3, a2
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    pack a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v8i8_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v8i8_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr, align 1
+  store <8 x i8> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v8i8_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v8i8_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 24
+; CHECK-RV32-NEXT:    srli a4, a2, 16
+; CHECK-RV32-NEXT:    srli a5, a2, 8
+; CHECK-RV32-NEXT:    srli a6, a0, 24
+; CHECK-RV32-NEXT:    srli a7, a0, 16
+; CHECK-RV32-NEXT:    sb a2, 4(a1)
+; CHECK-RV32-NEXT:    sb a5, 5(a1)
+; CHECK-RV32-NEXT:    sb a4, 6(a1)
+; CHECK-RV32-NEXT:    sb a3, 7(a1)
+; CHECK-RV32-NEXT:    srli a2, a0, 8
+; CHECK-RV32-NEXT:    sb a0, 0(a1)
+; CHECK-RV32-NEXT:    sb a2, 1(a1)
+; CHECK-RV32-NEXT:    sb a7, 2(a1)
+; CHECK-RV32-NEXT:    sb a6, 3(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v8i8_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 56
+; CHECK-RV64-NEXT:    srli a3, a0, 48
+; CHECK-RV64-NEXT:    srli a4, a0, 40
+; CHECK-RV64-NEXT:    srli a5, a0, 32
+; CHECK-RV64-NEXT:    srli a6, a0, 24
+; CHECK-RV64-NEXT:    srli a7, a0, 16
+; CHECK-RV64-NEXT:    sb a5, 4(a1)
+; CHECK-RV64-NEXT:    sb a4, 5(a1)
+; CHECK-RV64-NEXT:    sb a3, 6(a1)
+; CHECK-RV64-NEXT:    sb a2, 7(a1)
+; CHECK-RV64-NEXT:    srli a2, a0, 8
+; CHECK-RV64-NEXT:    sb a0, 0(a1)
+; CHECK-RV64-NEXT:    sb a2, 1(a1)
+; CHECK-RV64-NEXT:    sb a7, 2(a1)
+; CHECK-RV64-NEXT:    sb a6, 3(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v8i8_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v8i8_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  store <8 x i8> %a, ptr %b_ptr, align 1
+  ret void
+}
+
+define void @test_load_v8i8_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v8i8_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lhu a2, 2(a1)
+; CHECK-RV32-NEXT:    lhu a3, 4(a1)
+; CHECK-RV32-NEXT:    lhu a4, 6(a1)
+; CHECK-RV32-NEXT:    lhu a1, 0(a1)
+; CHECK-RV32-NEXT:    pack a3, a3, a4
+; CHECK-RV32-NEXT:    pack a1, a1, a2
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    sw a3, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v8i8_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lhu a2, 2(a1)
+; CHECK-RV64-NEXT:    lhu a3, 0(a1)
+; CHECK-RV64-NEXT:    lhu a4, 4(a1)
+; CHECK-RV64-NEXT:    lhu a1, 6(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 16
+; CHECK-RV64-NEXT:    or a2, a2, a3
+; CHECK-RV64-NEXT:    slli a4, a4, 32
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    or a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v8i8_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v8i8_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr, align 2
+  store <8 x i8> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v8i8_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v8i8_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 16
+; CHECK-RV32-NEXT:    srli a4, a0, 16
+; CHECK-RV32-NEXT:    sh a0, 0(a1)
+; CHECK-RV32-NEXT:    sh a4, 2(a1)
+; CHECK-RV32-NEXT:    sh a2, 4(a1)
+; CHECK-RV32-NEXT:    sh a3, 6(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v8i8_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 48
+; CHECK-RV64-NEXT:    srli a3, a0, 32
+; CHECK-RV64-NEXT:    srli a4, a0, 16
+; CHECK-RV64-NEXT:    sh a0, 0(a1)
+; CHECK-RV64-NEXT:    sh a4, 2(a1)
+; CHECK-RV64-NEXT:    sh a3, 4(a1)
+; CHECK-RV64-NEXT:    sh a2, 6(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v8i8_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v8i8_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  store <8 x i8> %a, ptr %b_ptr, align 2
+  ret void
+}
+
+define void @test_load_v4i16_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i16_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lbu a2, 5(a1)
+; CHECK-RV32-NEXT:    lbu a3, 6(a1)
+; CHECK-RV32-NEXT:    lbu a4, 7(a1)
+; CHECK-RV32-NEXT:    lbu a5, 4(a1)
+; CHECK-RV32-NEXT:    slli a2, a2, 8
+; CHECK-RV32-NEXT:    slli a3, a3, 16
+; CHECK-RV32-NEXT:    slli a4, a4, 24
+; CHECK-RV32-NEXT:    or a2, a2, a5
+; CHECK-RV32-NEXT:    or a3, a4, a3
+; CHECK-RV32-NEXT:    lbu a4, 1(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 2(a1)
+; CHECK-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-RV32-NEXT:    slli a4, a4, 8
+; CHECK-RV32-NEXT:    or a4, a4, a5
+; CHECK-RV32-NEXT:    slli a6, a6, 16
+; CHECK-RV32-NEXT:    slli a1, a1, 24
+; CHECK-RV32-NEXT:    or a1, a1, a6
+; CHECK-RV32-NEXT:    or a2, a3, a2
+; CHECK-RV32-NEXT:    or a1, a1, a4
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v4i16_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lbu a2, 5(a1)
+; CHECK-RV64-NEXT:    lbu a3, 6(a1)
+; CHECK-RV64-NEXT:    lbu a4, 7(a1)
+; CHECK-RV64-NEXT:    lbu a5, 4(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 8
+; CHECK-RV64-NEXT:    slli a3, a3, 16
+; CHECK-RV64-NEXT:    slli a4, a4, 24
+; CHECK-RV64-NEXT:    or a2, a2, a5
+; CHECK-RV64-NEXT:    or a3, a4, a3
+; CHECK-RV64-NEXT:    lbu a4, 1(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 2(a1)
+; CHECK-RV64-NEXT:    lbu a1, 3(a1)
+; CHECK-RV64-NEXT:    slli a4, a4, 8
+; CHECK-RV64-NEXT:    or a4, a4, a5
+; CHECK-RV64-NEXT:    slli a6, a6, 16
+; CHECK-RV64-NEXT:    slli a1, a1, 24
+; CHECK-RV64-NEXT:    or a1, a1, a6
+; CHECK-RV64-NEXT:    or a2, a3, a2
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    pack a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i16_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i16_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr, align 1
+  store <4 x i16> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v4i16_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i16_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 24
+; CHECK-RV32-NEXT:    srli a4, a2, 16
+; CHECK-RV32-NEXT:    srli a5, a2, 8
+; CHECK-RV32-NEXT:    srli a6, a0, 24
+; CHECK-RV32-NEXT:    srli a7, a0, 16
+; CHECK-RV32-NEXT:    sb a2, 4(a1)
+; CHECK-RV32-NEXT:    sb a5, 5(a1)
+; CHECK-RV32-NEXT:    sb a4, 6(a1)
+; CHECK-RV32-NEXT:    sb a3, 7(a1)
+; CHECK-RV32-NEXT:    srli a2, a0, 8
+; CHECK-RV32-NEXT:    sb a0, 0(a1)
+; CHECK-RV32-NEXT:    sb a2, 1(a1)
+; CHECK-RV32-NEXT:    sb a7, 2(a1)
+; CHECK-RV32-NEXT:    sb a6, 3(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v4i16_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 56
+; CHECK-RV64-NEXT:    srli a3, a0, 48
+; CHECK-RV64-NEXT:    srli a4, a0, 40
+; CHECK-RV64-NEXT:    srli a5, a0, 32
+; CHECK-RV64-NEXT:    srli a6, a0, 24
+; CHECK-RV64-NEXT:    srli a7, a0, 16
+; CHECK-RV64-NEXT:    sb a5, 4(a1)
+; CHECK-RV64-NEXT:    sb a4, 5(a1)
+; CHECK-RV64-NEXT:    sb a3, 6(a1)
+; CHECK-RV64-NEXT:    sb a2, 7(a1)
+; CHECK-RV64-NEXT:    srli a2, a0, 8
+; CHECK-RV64-NEXT:    sb a0, 0(a1)
+; CHECK-RV64-NEXT:    sb a2, 1(a1)
+; CHECK-RV64-NEXT:    sb a7, 2(a1)
+; CHECK-RV64-NEXT:    sb a6, 3(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i16_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i16_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  store <4 x i16> %a, ptr %b_ptr, align 1
+  ret void
+}
+
+define void @test_load_v4i16_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i16_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lhu a2, 2(a1)
+; CHECK-RV32-NEXT:    lhu a3, 4(a1)
+; CHECK-RV32-NEXT:    lhu a4, 6(a1)
+; CHECK-RV32-NEXT:    lhu a1, 0(a1)
+; CHECK-RV32-NEXT:    pack a3, a3, a4
+; CHECK-RV32-NEXT:    pack a1, a1, a2
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    sw a3, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v4i16_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lhu a2, 2(a1)
+; CHECK-RV64-NEXT:    lhu a3, 0(a1)
+; CHECK-RV64-NEXT:    lhu a4, 4(a1)
+; CHECK-RV64-NEXT:    lhu a1, 6(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 16
+; CHECK-RV64-NEXT:    or a2, a2, a3
+; CHECK-RV64-NEXT:    slli a4, a4, 32
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    or a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i16_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i16_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr, align 2
+  store <4 x i16> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v4i16_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i16_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 16
+; CHECK-RV32-NEXT:    srli a4, a0, 16
+; CHECK-RV32-NEXT:    sh a0, 0(a1)
+; CHECK-RV32-NEXT:    sh a4, 2(a1)
+; CHECK-RV32-NEXT:    sh a2, 4(a1)
+; CHECK-RV32-NEXT:    sh a3, 6(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v4i16_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 48
+; CHECK-RV64-NEXT:    srli a3, a0, 32
+; CHECK-RV64-NEXT:    srli a4, a0, 16
+; CHECK-RV64-NEXT:    sh a0, 0(a1)
+; CHECK-RV64-NEXT:    sh a4, 2(a1)
+; CHECK-RV64-NEXT:    sh a3, 4(a1)
+; CHECK-RV64-NEXT:    sh a2, 6(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i16_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i16_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  store <4 x i16> %a, ptr %b_ptr, align 2
+  ret void
+}
+
+define void @test_load_v4i16_align4(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v4i16_align4:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a1)
+; CHECK-RV32-NEXT:    lw a1, 4(a1)
+; CHECK-RV32-NEXT:    sw a2, 0(a0)
+; CHECK-RV32-NEXT:    sw a1, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v4i16_align4:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a1)
+; CHECK-RV64-NEXT:    lw a1, 4(a1)
+; CHECK-RV64-NEXT:    pack a1, a2, a1
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v4i16_align4:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v4i16_align4:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr, align 4
+  store <4 x i16> %a, ptr %ret_ptr
+  ret void
+}
+
+define void @test_store_v4i16_align4(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v4i16_align4:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    lw a0, 4(a0)
+; CHECK-RV32-NEXT:    sw a2, 0(a1)
+; CHECK-RV32-NEXT:    sw a0, 4(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v4i16_align4:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 32
+; CHECK-RV64-NEXT:    sw a0, 0(a1)
+; CHECK-RV64-NEXT:    sw a2, 4(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v4i16_align4:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v4i16_align4:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr ; no explicit align; the checks above expect one ld on riscv64 and two lw on riscv32
+  store <4 x i16> %a, ptr %b_ptr, align 4 ; case under test: 8-byte vector store that is only 4-byte aligned; two sw expected, except one sd on riscv64 with +unaligned-scalar-mem
+  ret void
+}
+
+define void @test_load_v2i32_align1(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i32_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lbu a2, 1(a1)
+; CHECK-RV32-NEXT:    lbu a3, 2(a1)
+; CHECK-RV32-NEXT:    lbu a4, 3(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    slli a2, a2, 8
+; CHECK-RV32-NEXT:    slli a3, a3, 16
+; CHECK-RV32-NEXT:    slli a4, a4, 24
+; CHECK-RV32-NEXT:    or a2, a2, a5
+; CHECK-RV32-NEXT:    or a3, a4, a3
+; CHECK-RV32-NEXT:    lbu a4, 5(a1)
+; CHECK-RV32-NEXT:    lbu a5, 4(a1)
+; CHECK-RV32-NEXT:    lbu a6, 6(a1)
+; CHECK-RV32-NEXT:    lbu a1, 7(a1)
+; CHECK-RV32-NEXT:    slli a4, a4, 8
+; CHECK-RV32-NEXT:    or a4, a4, a5
+; CHECK-RV32-NEXT:    slli a6, a6, 16
+; CHECK-RV32-NEXT:    slli a1, a1, 24
+; CHECK-RV32-NEXT:    or a1, a1, a6
+; CHECK-RV32-NEXT:    or a2, a3, a2
+; CHECK-RV32-NEXT:    or a1, a1, a4
+; CHECK-RV32-NEXT:    sw a2, 0(a0)
+; CHECK-RV32-NEXT:    sw a1, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v2i32_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lbu a2, 5(a1)
+; CHECK-RV64-NEXT:    lbu a3, 6(a1)
+; CHECK-RV64-NEXT:    lbu a4, 7(a1)
+; CHECK-RV64-NEXT:    lbu a5, 4(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 8
+; CHECK-RV64-NEXT:    slli a3, a3, 16
+; CHECK-RV64-NEXT:    slli a4, a4, 24
+; CHECK-RV64-NEXT:    or a2, a2, a5
+; CHECK-RV64-NEXT:    or a3, a4, a3
+; CHECK-RV64-NEXT:    lbu a4, 1(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 2(a1)
+; CHECK-RV64-NEXT:    lbu a1, 3(a1)
+; CHECK-RV64-NEXT:    slli a4, a4, 8
+; CHECK-RV64-NEXT:    or a4, a4, a5
+; CHECK-RV64-NEXT:    slli a6, a6, 16
+; CHECK-RV64-NEXT:    slli a1, a1, 24
+; CHECK-RV64-NEXT:    or a1, a1, a6
+; CHECK-RV64-NEXT:    or a2, a3, a2
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    pack a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v2i32_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v2i32_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr, align 1 ; case under test: 8-byte vector load with 1-byte alignment; byte loads (lbu) expected unless +unaligned-scalar-mem is enabled
+  store <2 x i32> %a, ptr %ret_ptr ; no explicit align; the checks above expect one sd on riscv64 and two sw on riscv32
+  ret void
+}
+
+define void @test_store_v2i32_align1(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v2i32_align1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 24
+; CHECK-RV32-NEXT:    srli a4, a2, 16
+; CHECK-RV32-NEXT:    srli a5, a2, 8
+; CHECK-RV32-NEXT:    srli a6, a0, 24
+; CHECK-RV32-NEXT:    srli a7, a0, 16
+; CHECK-RV32-NEXT:    sb a2, 4(a1)
+; CHECK-RV32-NEXT:    sb a5, 5(a1)
+; CHECK-RV32-NEXT:    sb a4, 6(a1)
+; CHECK-RV32-NEXT:    sb a3, 7(a1)
+; CHECK-RV32-NEXT:    srli a2, a0, 8
+; CHECK-RV32-NEXT:    sb a0, 0(a1)
+; CHECK-RV32-NEXT:    sb a2, 1(a1)
+; CHECK-RV32-NEXT:    sb a7, 2(a1)
+; CHECK-RV32-NEXT:    sb a6, 3(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v2i32_align1:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 56
+; CHECK-RV64-NEXT:    srli a3, a0, 48
+; CHECK-RV64-NEXT:    srli a4, a0, 40
+; CHECK-RV64-NEXT:    srli a5, a0, 32
+; CHECK-RV64-NEXT:    srli a6, a0, 24
+; CHECK-RV64-NEXT:    srli a7, a0, 16
+; CHECK-RV64-NEXT:    sb a5, 4(a1)
+; CHECK-RV64-NEXT:    sb a4, 5(a1)
+; CHECK-RV64-NEXT:    sb a3, 6(a1)
+; CHECK-RV64-NEXT:    sb a2, 7(a1)
+; CHECK-RV64-NEXT:    srli a2, a0, 8
+; CHECK-RV64-NEXT:    sb a0, 0(a1)
+; CHECK-RV64-NEXT:    sb a2, 1(a1)
+; CHECK-RV64-NEXT:    sb a7, 2(a1)
+; CHECK-RV64-NEXT:    sb a6, 3(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v2i32_align1:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v2i32_align1:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr ; no explicit align; the checks above expect one ld on riscv64 and two lw on riscv32
+  store <2 x i32> %a, ptr %b_ptr, align 1 ; case under test: 8-byte vector store with 1-byte alignment; byte stores (sb) expected unless +unaligned-scalar-mem is enabled
+  ret void
+}
+
+define void @test_load_v2i32_align2(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i32_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lhu a2, 0(a1)
+; CHECK-RV32-NEXT:    lhu a3, 2(a1)
+; CHECK-RV32-NEXT:    lhu a4, 4(a1)
+; CHECK-RV32-NEXT:    lhu a1, 6(a1)
+; CHECK-RV32-NEXT:    pack a2, a2, a3
+; CHECK-RV32-NEXT:    pack a1, a4, a1
+; CHECK-RV32-NEXT:    sw a2, 0(a0)
+; CHECK-RV32-NEXT:    sw a1, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v2i32_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lhu a2, 2(a1)
+; CHECK-RV64-NEXT:    lhu a3, 0(a1)
+; CHECK-RV64-NEXT:    lhu a4, 4(a1)
+; CHECK-RV64-NEXT:    lhu a1, 6(a1)
+; CHECK-RV64-NEXT:    slli a2, a2, 16
+; CHECK-RV64-NEXT:    or a2, a2, a3
+; CHECK-RV64-NEXT:    slli a4, a4, 32
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    or a1, a1, a4
+; CHECK-RV64-NEXT:    or a1, a1, a2
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v2i32_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v2i32_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr, align 2 ; case under test: 8-byte vector load with 2-byte alignment; halfword loads (lhu) expected unless +unaligned-scalar-mem is enabled
+  store <2 x i32> %a, ptr %ret_ptr ; no explicit align; the checks above expect one sd on riscv64 and two sw on riscv32
+  ret void
+}
+
+define void @test_store_v2i32_align2(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v2i32_align2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 4(a0)
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    srli a3, a2, 16
+; CHECK-RV32-NEXT:    srli a4, a0, 16
+; CHECK-RV32-NEXT:    sh a0, 0(a1)
+; CHECK-RV32-NEXT:    sh a4, 2(a1)
+; CHECK-RV32-NEXT:    sh a2, 4(a1)
+; CHECK-RV32-NEXT:    sh a3, 6(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v2i32_align2:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 48
+; CHECK-RV64-NEXT:    srli a3, a0, 32
+; CHECK-RV64-NEXT:    srli a4, a0, 16
+; CHECK-RV64-NEXT:    sh a0, 0(a1)
+; CHECK-RV64-NEXT:    sh a4, 2(a1)
+; CHECK-RV64-NEXT:    sh a3, 4(a1)
+; CHECK-RV64-NEXT:    sh a2, 6(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v2i32_align2:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v2i32_align2:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr ; no explicit align; the checks above expect one ld on riscv64 and two lw on riscv32
+  store <2 x i32> %a, ptr %b_ptr, align 2 ; case under test: 8-byte vector store with 2-byte alignment; halfword stores (sh) expected unless +unaligned-scalar-mem is enabled
+  ret void
+}
+
+define void @test_load_v2i32_align4(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-RV32-LABEL: test_load_v2i32_align4:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a1)
+; CHECK-RV32-NEXT:    lw a1, 4(a1)
+; CHECK-RV32-NEXT:    sw a2, 0(a0)
+; CHECK-RV32-NEXT:    sw a1, 4(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_load_v2i32_align4:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a1)
+; CHECK-RV64-NEXT:    lw a1, 4(a1)
+; CHECK-RV64-NEXT:    pack a1, a2, a1
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_load_v2i32_align4:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    lw a1, 4(a1)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    sw a1, 4(a0)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_load_v2i32_align4:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a1, 0(a1)
+; UNALIGNED-RV64-NEXT:    sd a1, 0(a0)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr, align 4 ; case under test: 8-byte vector load that is only 4-byte aligned; two lw expected, except one ld on riscv64 with +unaligned-scalar-mem
+  store <2 x i32> %a, ptr %ret_ptr ; no explicit align; the checks above expect one sd on riscv64 and two sw on riscv32
+  ret void
+}
+
+define void @test_store_v2i32_align4(ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_store_v2i32_align4:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    lw a0, 4(a0)
+; CHECK-RV32-NEXT:    sw a2, 0(a1)
+; CHECK-RV32-NEXT:    sw a0, 4(a1)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_store_v2i32_align4:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    srli a2, a0, 32
+; CHECK-RV64-NEXT:    sw a0, 0(a1)
+; CHECK-RV64-NEXT:    sw a2, 4(a1)
+; CHECK-RV64-NEXT:    ret
+;
+; UNALIGNED-RV32-LABEL: test_store_v2i32_align4:
+; UNALIGNED-RV32:       # %bb.0:
+; UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; UNALIGNED-RV32-NEXT:    lw a0, 4(a0)
+; UNALIGNED-RV32-NEXT:    sw a2, 0(a1)
+; UNALIGNED-RV32-NEXT:    sw a0, 4(a1)
+; UNALIGNED-RV32-NEXT:    ret
+;
+; UNALIGNED-RV64-LABEL: test_store_v2i32_align4:
+; UNALIGNED-RV64:       # %bb.0:
+; UNALIGNED-RV64-NEXT:    ld a0, 0(a0)
+; UNALIGNED-RV64-NEXT:    sd a0, 0(a1)
+; UNALIGNED-RV64-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr ; no explicit align; the checks above expect one ld on riscv64 and two lw on riscv32
+  store <2 x i32> %a, ptr %b_ptr, align 4 ; case under test: 8-byte vector store that is only 4-byte aligned; two sw expected, except one sd on riscv64 with +unaligned-scalar-mem
+  ret void
+}