blob: 895acc51287fc58a443a48c4169e164016316efb [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -o - %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios"
; It's profitable to convert the zext to a shuffle, which in turn will be
; lowered to 4 tbl instructions, with the masks materialized outside the loop.
; The autogenerated assertions below currently show the ushll/ushll2 lowering.
; Loop form of the <16 x i8> -> <16 x i32> zero-extension test.
; Each iteration loads 16 bytes at %src + %iv, zero-extends every lane to i32
; and stores the 16 x i32 result at element index %iv of %dst. %iv starts at 0
; and advances by 16 until it equals 128.
; The expected assembly widens in two steps (ushll/ushll2 to .8h, then to .4s)
; and writes the four result registers with two stp instructions.
define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: ushll2.8h v1, v0, #0
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ushll2.4s v2, v1, #0
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ushll2.4s v3, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: stp q1, q2, [x1, #32]
; CHECK-NEXT: stp q0, q3, [x1], #64
; CHECK-NEXT: b.ne LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
entry:
br label %loop
loop:
; Byte offset into %src; the same value is the i32 element index into %dst.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
; Read 16 source bytes starting at %src + %iv.
%src.gep = getelementptr i8, i8* %src, i64 %iv
%src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
%load = load <16 x i8>, <16 x i8>* %src.gep.cast
; The widening under test.
%ext = zext <16 x i8> %load to <16 x i32>
; %dst is indexed in i32 elements, so the address is %dst + 4 * %iv bytes.
%dst.gep = getelementptr i32, i32* %dst, i64 %iv
%dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
; Step by 16 and leave the loop once the counter reaches 128.
%iv.next = add nuw i64 %iv, 16
%ec = icmp eq i64 %iv.next, 128
br i1 %ec, label %exit, label %loop
exit:
ret void
}
; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
; require more instructions than lowering zext directly.
; Straight-line form of the <16 x i8> -> <16 x i32> zero-extension test:
; a single 16-byte load from %src is zero-extended and stored to %dst, with
; no loop around it.
; The expected assembly is one ldr, six ushll/ushll2 widenings (.8h, then .4s)
; and two stp stores; no tbl instructions appear.
define void @zext_v16i8_to_v16i32_no_loop(i8* %src, i32* %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i32_no_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ushll2.8h v1, v0, #0
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ushll2.4s v2, v1, #0
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ushll2.4s v3, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: stp q1, q2, [x1, #32]
; CHECK-NEXT: stp q0, q3, [x1]
; CHECK-NEXT: ret
entry:
; Reinterpret %src as a pointer to one 16-lane byte vector and load it.
%src.cast = bitcast i8* %src to <16 x i8>*
%load = load <16 x i8>, <16 x i8>* %src.cast
; The widening under test.
%ext = zext <16 x i8> %load to <16 x i32>
; Write all 16 widened lanes starting at %dst.
%dst.cast = bitcast i32* %dst to <16 x i32>*
store <16 x i32> %ext, <16 x i32>* %dst.cast
ret void
}
; Loop test for zero-extending <16 x i8> to <16 x i16>.
; Each iteration loads 16 bytes at %src + %iv, widens every lane to i16 and
; stores the result at element index %iv of %dst. %iv starts at 0 and advances
; by 16 until it equals 128.
; The expected assembly needs only one widening step (ushll2.8h / ushll.8h)
; followed by a single post-indexed stp.
define void @zext_v16i8_to_v16i16_in_loop(i8* %src, i16* %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i16_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB2_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: ushll2.8h v1, v0, #0
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: stp q0, q1, [x1], #32
; CHECK-NEXT: b.ne LBB2_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
entry:
br label %loop
loop:
; Byte offset into %src; the same value is the i16 element index into %dst.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
; Read 16 source bytes starting at %src + %iv.
%src.gep = getelementptr i8, i8* %src, i64 %iv
%src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
%load = load <16 x i8>, <16 x i8>* %src.gep.cast
; The widening under test.
%ext = zext <16 x i8> %load to <16 x i16>
; %dst is indexed in i16 elements, so the address is %dst + 2 * %iv bytes.
%dst.gep = getelementptr i16, i16* %dst, i64 %iv
%dst.gep.cast = bitcast i16* %dst.gep to <16 x i16>*
store <16 x i16> %ext, <16 x i16>* %dst.gep.cast
; Step by 16 and leave the loop once the counter reaches 128.
%iv.next = add nuw i64 %iv, 16
%ec = icmp eq i64 %iv.next, 128
br i1 %ec, label %exit, label %loop
exit:
ret void
}
; Loop test for zero-extending <8 x i8> to <8 x i32>.
; Each iteration loads 8 bytes at %src + %iv, widens every lane to i32 and
; stores the 8 x i32 result at element index %iv of %dst. %iv starts at 0 and
; advances by 16 until it equals 128.
; The expected assembly uses a 64-bit load (ldr d0), one ushll to .8h, an
; ushll2/ushll pair to .4s, and a single stp post-indexed by 64 bytes.
define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB3_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d0, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ushll2.4s v1, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: stp q0, q1, [x1], #64
; CHECK-NEXT: b.ne LBB3_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
entry:
br label %loop
loop:
; Byte offset into %src; the same value is the i32 element index into %dst.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
; Read 8 source bytes starting at %src + %iv.
%src.gep = getelementptr i8, i8* %src, i64 %iv
%src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
%load = load <8 x i8>, <8 x i8>* %src.gep.cast
; The widening under test.
%ext = zext <8 x i8> %load to <8 x i32>
; %dst is indexed in i32 elements, so the address is %dst + 4 * %iv bytes.
%dst.gep = getelementptr i32, i32* %dst, i64 %iv
%dst.gep.cast = bitcast i32* %dst.gep to <8 x i32>*
store <8 x i32> %ext, <8 x i32>* %dst.gep.cast
; The counter steps by 16 although only 8 bytes are loaded and 32 bytes are
; stored per iteration, so consecutive accesses leave gaps in both buffers.
; NOTE(review): presumably kept to match the other loop tests in this file;
; confirm this is intentional.
%iv.next = add nuw i64 %iv, 16
%ec = icmp eq i64 %iv.next, 128
br i1 %ec, label %exit, label %loop
exit:
ret void
}
; Loop test for zero-extending <16 x i8> to <16 x i64>.
; Each iteration loads 16 bytes at %src + %iv, widens every lane to i64 and
; stores the 16 x i64 result at element index %iv of %dst. %iv starts at 0 and
; advances by 16 until it equals 128.
; The expected assembly widens in three steps (ushll/ushll2 to .8h, .4s, then
; .2d) and writes the eight result registers with four stp instructions, the
; last one post-indexed by 128 bytes.
define void @zext_v16i8_to_v16i64_in_loop(i8* %src, i64* %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i64_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB4_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: ushll.8h v1, v0, #0
; CHECK-NEXT: ushll2.8h v0, v0, #0
; CHECK-NEXT: ushll2.4s v2, v1, #0
; CHECK-NEXT: ushll2.4s v3, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: ushll2.2d v4, v3, #0
; CHECK-NEXT: ushll2.2d v5, v0, #0
; CHECK-NEXT: ushll.2d v0, v0, #0
; CHECK-NEXT: ushll.2d v3, v3, #0
; CHECK-NEXT: stp q0, q5, [x1, #64]
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: stp q3, q4, [x1, #96]
; CHECK-NEXT: ushll2.2d v3, v2, #0
; CHECK-NEXT: ushll.2d v2, v2, #0
; CHECK-NEXT: ushll2.2d v1, v0, #0
; CHECK-NEXT: ushll.2d v0, v0, #0
; CHECK-NEXT: stp q2, q3, [x1, #32]
; CHECK-NEXT: stp q0, q1, [x1], #128
; CHECK-NEXT: b.ne LBB4_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
entry:
br label %loop
loop:
; Byte offset into %src; the same value is the i64 element index into %dst.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
; Read 16 source bytes starting at %src + %iv.
%src.gep = getelementptr i8, i8* %src, i64 %iv
%src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
%load = load <16 x i8>, <16 x i8>* %src.gep.cast
; The widening under test.
%ext = zext <16 x i8> %load to <16 x i64>
; %dst is indexed in i64 elements, so the address is %dst + 8 * %iv bytes.
%dst.gep = getelementptr i64, i64* %dst, i64 %iv
%dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
; Step by 16 and leave the loop once the counter reaches 128.
%iv.next = add nuw i64 %iv, 16
%ec = icmp eq i64 %iv.next, 128
br i1 %ec, label %exit, label %loop
exit:
ret void
}