; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s

; FIXME: Currently, we avoid narrowing this v4i32 load in the hope of
; folding the shift into the load's addressing mode, even though doing so
; requires a stack store plus reload. Ideally, we should narrow here and
; load the i32 directly from the variable offset, e.g.:
;
;   add x8, x0, x1, lsl #4
;   and x9, x2, #0x3
;   ldr w0, [x8, x9, lsl #2]
;
; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
; probably be updated to prefer load-narrowing over folding the lsl in
; larger vector cases.
;
define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr q0, [x0, x1, lsl #4]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    bfi x8, x2, #2, #2
; CHECK-NEXT:    str q0, [sp]
; CHECK-NEXT:    ldr w0, [x8]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
entry:
  %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
  %x = load <4 x i32>, ptr %idx, align 8
  %res = extractelement <4 x i32> %x, i32 %ele
  ret i32 %res
}
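
; For comparison, a hand-narrowed scalar form of the same access (an
; illustrative sketch, not part of the original test: the "_scalar" name is
; hypothetical and %ele is assumed to already be in range for the vector).
; Loading the i32 directly should let ISel pick a register-offset addressing
; mode rather than the stack round-trip above.
define i32 @narrow_load_v4_i32_single_ele_variable_idx_scalar(ptr %ptr, i64 %off, i32 %ele) {
entry:
  %vec.addr = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
  %ele.ext = zext i32 %ele to i64
  %ele.addr = getelementptr inbounds i32, ptr %vec.addr, i64 %ele.ext
  %res = load i32, ptr %ele.addr, align 4
  ret i32 %res
}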