; test/CodeGen/AArch64/sve-fixed-vector-zext.ll - llvm-project/llvm - Git at Google


 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-n1 -O3 -opaque-pointers -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v2 -O3 -opaque-pointers -verify-machineinstrs | FileCheck %s --check-prefixes=NEON

 ; Accumulates, over 16 loop iterations, the add-reduction of the element-wise
 ; difference between two zero-extended 16-byte vectors.
 ;
 ;   %p1, %p2 : base pointers of the two byte buffers (read only).
 ;   %i1, %i2 : per-iteration strides; iteration k reads 16 bytes at
 ;              %p1 + sext(k * %i1) and at %p2 + sext(k * %i2).
 ;   returns  : the running i32 total after the last iteration.
 ;
 ; The IR widens both inputs to <16 x i32> before subtracting. The expected
 ; assembly below instead subtracts at 16-bit element width and only then
 ; widens to 32 bits for the final reduction.
 define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) {
 ; Expected output when fixed-length SVE lowering is enabled (256-bit minimum).
 ; SVE256-LABEL: test:
 ; SVE256:       ld1b    { z0.h }, p0/z,
 ; SVE256:       ld1b    { z1.h }, p0/z,
 ; SVE256:       sub z0.h, z0.h, z1.h
 ; SVE256-NEXT:  sunpklo z1.s, z0.h
 ; SVE256-NEXT:  ext z0.b, z0.b, z0.b, #16
 ; SVE256-NEXT:  sunpklo z0.s, z0.h
 ; SVE256-NEXT:  add z0.s, z1.s, z0.s
 ; SVE256-NEXT:  uaddv   d0, p1, z0.s

 ; Expected output for the remaining configurations, which use 128-bit vectors.
 ; NEON-LABEL: test:
 ; NEON:       ldr q0, [x0, w9, sxtw]
 ; NEON:       ldr q1, [x2, w10, sxtw]
 ; NEON:       usubl2  v2.8h, v0.16b, v1.16b
 ; NEON-NEXT:  usubl   v0.8h, v0.8b, v1.8b
 ; NEON:       saddl2  v1.4s, v0.8h, v2.8h
 ; NEON-NEXT:  saddl   v0.4s, v0.4h, v2.4h
 ; NEON-NEXT:  add v0.4s, v0.4s, v1.4s
 ; NEON-NEXT:  addv    s0, v0.4s

 entry:
   br label %row.loop

 row.loop:                                    ; preds = %row.loop, %entry
   ; Loop state: iterations remaining, current iteration index, running total.
   %rows.left = phi i32 [ 16, %entry ], [ %rows.left.next, %row.loop ]
   %row = phi i32 [ 0, %entry ], [ %row.next, %row.loop ]
   %acc = phi i32 [ 0, %entry ], [ %acc.next, %row.loop ]

   ; Address of this iteration's 16 bytes in the first buffer. The offset is
   ; computed in 32 bits and sign-extended afterwards.
   %off1.i32 = mul i32 %row, %i1
   %off1 = sext i32 %off1.i32 to i64
   %addr1 = getelementptr i8, ptr %p1, i64 %off1

   ; Same address computation for the second buffer.
   %off2.i32 = mul i32 %row, %i2
   %off2 = sext i32 %off2.i32 to i64
   %addr2 = getelementptr i8, ptr %p2, i64 %off2

   ; Byte-aligned loads, each zero-extended to 32-bit lanes.
   %bytes1 = load <16 x i8>, ptr %addr1, align 1
   %wide1 = zext <16 x i8> %bytes1 to <16 x i32>
   %bytes2 = load <16 x i8>, ptr %addr2, align 1
   %wide2 = zext <16 x i8> %bytes2 to <16 x i32>

   ; Lane-wise difference, summed across lanes and folded into the total.
   %diff = sub nsw <16 x i32> %wide1, %wide2
   %row.sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %diff)
   %acc.next = add i32 %row.sum, %acc

   ; Advance the index, decrement the trip counter, exit when it reaches zero.
   %row.next = add nuw nsw i32 %row, 1
   %rows.left.next = add nsw i32 %rows.left, -1
   %done = icmp eq i32 %rows.left.next, 0
   br i1 %done, label %exit, label %row.loop

 exit:                                        ; preds = %row.loop
   ret i32 %acc.next
 }

 ; Add-reduction intrinsic over <16 x i32>, returning a single i32; @test calls
 ; it once per loop iteration on the widened difference vector.
 declare  i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

	; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256
	; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
	; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-n1 -O3 -opaque-pointers -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
	; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v2 -O3 -opaque-pointers -verify-machineinstrs | FileCheck %s --check-prefixes=NEON

	; Accumulates, over 16 loop iterations, the add-reduction of the element-wise
	; difference between two zero-extended 16-byte vectors.
	;
	;   %p1, %p2 : base pointers of the two byte buffers (read only).
	;   %i1, %i2 : per-iteration strides; iteration k reads 16 bytes at
	;              %p1 + sext(k * %i1) and at %p2 + sext(k * %i2).
	;   returns  : the running i32 total after the last iteration.
	;
	; The IR widens both inputs to <16 x i32> before subtracting. The expected
	; assembly below instead subtracts at 16-bit element width and only then
	; widens to 32 bits for the final reduction.
	define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) {
	; Expected output when fixed-length SVE lowering is enabled (256-bit minimum).
	; SVE256-LABEL: test:
	; SVE256: ld1b { z0.h }, p0/z,
	; SVE256: ld1b { z1.h }, p0/z,
	; SVE256: sub z0.h, z0.h, z1.h
	; SVE256-NEXT: sunpklo z1.s, z0.h
	; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16
	; SVE256-NEXT: sunpklo z0.s, z0.h
	; SVE256-NEXT: add z0.s, z1.s, z0.s
	; SVE256-NEXT: uaddv d0, p1, z0.s

	; Expected output for the remaining configurations, which use 128-bit vectors.
	; NEON-LABEL: test:
	; NEON: ldr q0, [x0, w9, sxtw]
	; NEON: ldr q1, [x2, w10, sxtw]
	; NEON: usubl2 v2.8h, v0.16b, v1.16b
	; NEON-NEXT: usubl v0.8h, v0.8b, v1.8b
	; NEON: saddl2 v1.4s, v0.8h, v2.8h
	; NEON-NEXT: saddl v0.4s, v0.4h, v2.4h
	; NEON-NEXT: add v0.4s, v0.4s, v1.4s
	; NEON-NEXT: addv s0, v0.4s

	entry:
	  br label %row.loop

	row.loop: ; preds = %row.loop, %entry
	  ; Loop state: iterations remaining, current iteration index, running total.
	  %rows.left = phi i32 [ 16, %entry ], [ %rows.left.next, %row.loop ]
	  %row = phi i32 [ 0, %entry ], [ %row.next, %row.loop ]
	  %acc = phi i32 [ 0, %entry ], [ %acc.next, %row.loop ]

	  ; Address of this iteration's 16 bytes in the first buffer. The offset is
	  ; computed in 32 bits and sign-extended afterwards.
	  %off1.i32 = mul i32 %row, %i1
	  %off1 = sext i32 %off1.i32 to i64
	  %addr1 = getelementptr i8, ptr %p1, i64 %off1

	  ; Same address computation for the second buffer.
	  %off2.i32 = mul i32 %row, %i2
	  %off2 = sext i32 %off2.i32 to i64
	  %addr2 = getelementptr i8, ptr %p2, i64 %off2

	  ; Byte-aligned loads, each zero-extended to 32-bit lanes.
	  %bytes1 = load <16 x i8>, ptr %addr1, align 1
	  %wide1 = zext <16 x i8> %bytes1 to <16 x i32>
	  %bytes2 = load <16 x i8>, ptr %addr2, align 1
	  %wide2 = zext <16 x i8> %bytes2 to <16 x i32>

	  ; Lane-wise difference, summed across lanes and folded into the total.
	  %diff = sub nsw <16 x i32> %wide1, %wide2
	  %row.sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %diff)
	  %acc.next = add i32 %row.sum, %acc

	  ; Advance the index, decrement the trip counter, exit when it reaches zero.
	  %row.next = add nuw nsw i32 %row, 1
	  %rows.left.next = add nsw i32 %rows.left, -1
	  %done = icmp eq i32 %rows.left.next, 0
	  br i1 %done, label %exit, label %row.loop

	exit: ; preds = %row.loop
	  ret i32 %acc.next
	}

	; Add-reduction intrinsic over <16 x i32>, returning a single i32; @test calls
	; it once per loop iteration on the widened difference vector.
	declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)