; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s

; FIXME: Currently, we avoid narrowing this v4i32 load in the hope of
; folding the shift into the load's addressing mode, even though doing so
; requires a stack store plus reload. Ideally, we should narrow here and
; load the i32 directly from the variable offset, e.g.:
;
;   add x8, x0, x1, lsl #4
;   and x9, x2, #0x3
;   ldr w0, [x8, x9, lsl #2]
;
; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
; probably be updated to prefer load-narrowing over folding the lsl in
; larger vector cases.
;
define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr q0, [x0, x1, lsl #4]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    bfi x8, x2, #2, #2
; CHECK-NEXT:    str q0, [sp]
; CHECK-NEXT:    ldr w0, [x8]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
entry:
  %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
  %x = load <4 x i32>, ptr %idx, align 8
  %res = extractelement <4 x i32> %x, i32 %ele
  ret i32 %res
}
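
; For comparison, a hand-narrowed scalar form of the same access (an
; illustrative sketch, not part of the original test: the "_scalar" name is
; hypothetical and %ele is assumed to already be in range for the vector).
; Loading the i32 directly should let ISel pick a register-offset addressing
; mode rather than the stack round-trip above.
define i32 @narrow_load_v4_i32_single_ele_variable_idx_scalar(ptr %ptr, i64 %off, i32 %ele) {
entry:
  %vec.addr = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
  %ele.ext = zext i32 %ele to i64
  %ele.addr = getelementptr inbounds i32, ptr %vec.addr, i64 %ele.ext
  %res = load i32, ptr %ele.addr, align 4
  ret i32 %res
}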