| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON |
| ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE |
| ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI |
| |
| ; CHECK-GI: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1 |
| ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1 |
| |
| ; Signed widening multiply: both <8 x i8> loads are sign-extended to i16 lanes |
| ; before the mul. Every run configuration expects a single smull. |
| define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind { |
| ; CHECK-LABEL: smull_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr d0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEXT: ret |
| %lhs = load <8 x i8>, ptr %A |
| %rhs = load <8 x i8>, ptr %B |
| %lhs.wide = sext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = sext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| ret <8 x i16> %prod |
| } |
| |
| ; Signed widening multiply: both <4 x i16> loads are sign-extended to i32 lanes |
| ; before the mul. Every run configuration expects a single smull. |
| define <4 x i32> @smull_v4i16_v4i32(ptr %A, ptr %B) nounwind { |
| ; CHECK-LABEL: smull_v4i16_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr d0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEXT: ret |
| %lhs = load <4 x i16>, ptr %A |
| %rhs = load <4 x i16>, ptr %B |
| %lhs.wide = sext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = sext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| ret <4 x i32> %prod |
| } |
| |
| ; Signed widening multiply: both <2 x i32> loads are sign-extended to i64 lanes |
| ; before the mul. Every run configuration expects a single smull. |
| define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind { |
| ; CHECK-LABEL: smull_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr d0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: ret |
| %lhs = load <2 x i32>, ptr %A |
| %rhs = load <2 x i32>, ptr %B |
| %lhs.wide = sext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = sext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| ret <2 x i64> %prod |
| } |
| |
| ; Mixed extension: %A is loaded as <8 x i8> and zero-extended, %B is loaded as |
| ; <8 x i16> and sign-extended, both to i32 lanes. All three configurations |
| ; expect a ushll on the i8 operand followed by an smull/smull2 pair. |
| define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr d0, [x0] |
| ; CHECK-NEON-NEXT: ldr q2, [x1] |
| ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-NEON-NEXT: smull2 v1.4s, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v2.4h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr d0, [x0] |
| ; CHECK-SVE-NEXT: ldr q2, [x1] |
| ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-SVE-NEXT: smull2 v1.4s, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v2.4h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr d0, [x0] |
| ; CHECK-GI-NEXT: ldr q2, [x1] |
| ; CHECK-GI-NEXT: ushll v1.8h, v0.8b, #0 |
| ; CHECK-GI-NEXT: smull v0.4s, v1.4h, v2.4h |
| ; CHECK-GI-NEXT: smull2 v1.4s, v1.8h, v2.8h |
| ; CHECK-GI-NEXT: ret |
| %narrow.u = load <8 x i8>, ptr %A |
| %narrow.s = load <8 x i16>, ptr %B |
| %wide.u = zext <8 x i8> %narrow.u to <8 x i32> |
| %wide.s = sext <8 x i16> %narrow.s to <8 x i32> |
| %prod = mul <8 x i32> %wide.u, %wide.s |
| ret <8 x i32> %prod |
| } |
| |
| ; Same mixed-extension multiply as smull_zext_v8i8_v8i32 but with the operands |
| ; swapped: %A is the sign-extended <8 x i16> value and %B the zero-extended |
| ; <8 x i8> value. The expected output still uses ushll plus smull/smull2. |
| define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr d0, [x1] |
| ; CHECK-NEON-NEXT: ldr q2, [x0] |
| ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-NEON-NEXT: smull2 v1.4s, v2.8h, v0.8h |
| ; CHECK-NEON-NEXT: smull v0.4s, v2.4h, v0.4h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr d0, [x1] |
| ; CHECK-SVE-NEXT: ldr q2, [x0] |
| ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-SVE-NEXT: smull2 v1.4s, v2.8h, v0.8h |
| ; CHECK-SVE-NEXT: smull v0.4s, v2.4h, v0.4h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr d0, [x1] |
| ; CHECK-GI-NEXT: ldr q2, [x0] |
| ; CHECK-GI-NEXT: ushll v1.8h, v0.8b, #0 |
| ; CHECK-GI-NEXT: smull v0.4s, v2.4h, v1.4h |
| ; CHECK-GI-NEXT: smull2 v1.4s, v2.8h, v1.8h |
| ; CHECK-GI-NEXT: ret |
| %narrow.s = load <8 x i16>, ptr %A |
| %narrow.u = load <8 x i8>, ptr %B |
| %wide.s = sext <8 x i16> %narrow.s to <8 x i32> |
| %wide.u = zext <8 x i8> %narrow.u to <8 x i32> |
| %prod = mul <8 x i32> %wide.s, %wide.u |
| ret <8 x i32> %prod |
| } |
| |
| ; Negative case: the zero-extended operand has bit 15 of every i16 lane forced |
| ; on by the or with 0x8000. The expected output keeps separate ushll/sshll |
| ; extensions and plain mul instructions rather than forming smull. |
| define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q0, [x0] |
| ; CHECK-NEON-NEXT: ldr q1, [x1] |
| ; CHECK-NEON-NEXT: orr v0.8h, #128, lsl #8 |
| ; CHECK-NEON-NEXT: sshll v3.4s, v1.4h, #0 |
| ; CHECK-NEON-NEXT: sshll2 v1.4s, v1.8h, #0 |
| ; CHECK-NEON-NEXT: ushll v2.4s, v0.4h, #0 |
| ; CHECK-NEON-NEXT: ushll2 v0.4s, v0.8h, #0 |
| ; CHECK-NEON-NEXT: mul v1.4s, v0.4s, v1.4s |
| ; CHECK-NEON-NEXT: mul v0.4s, v2.4s, v3.4s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q0, [x0] |
| ; CHECK-SVE-NEXT: ldr q1, [x1] |
| ; CHECK-SVE-NEXT: orr v0.8h, #128, lsl #8 |
| ; CHECK-SVE-NEXT: sshll v3.4s, v1.4h, #0 |
| ; CHECK-SVE-NEXT: sshll2 v1.4s, v1.8h, #0 |
| ; CHECK-SVE-NEXT: ushll v2.4s, v0.4h, #0 |
| ; CHECK-SVE-NEXT: ushll2 v0.4s, v0.8h, #0 |
| ; CHECK-SVE-NEXT: mul v1.4s, v0.4s, v1.4s |
| ; CHECK-SVE-NEXT: mul v0.4s, v2.4s, v3.4s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v0.8h, #128, lsl #8 |
| ; CHECK-GI-NEXT: ldr q1, [x0] |
| ; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b |
| ; CHECK-GI-NEXT: ldr q1, [x1] |
| ; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 |
| ; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 |
| ; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 |
| ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 |
| ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s |
| ; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s |
| ; CHECK-GI-NEXT: ret |
| %raw.u = load <8 x i16>, ptr %A |
| %topset.u = or <8 x i16> %raw.u, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000> |
| %raw.s = load <8 x i16>, ptr %B |
| %wide.u = zext <8 x i16> %topset.u to <8 x i32> |
| %wide.s = sext <8 x i16> %raw.s to <8 x i32> |
| %prod = mul <8 x i32> %wide.u, %wide.s |
| ret <8 x i32> %prod |
| } |
| |
| ; Mixed extension with a sub-64-bit vector load: %A is a <4 x i8> that is |
| ; zero-extended, %B a <4 x i16> that is sign-extended, both to i32 lanes. |
| ; NEON and SVE expect ldr s0 + ushll + smull; the GlobalISel run expects a |
| ; longer sequence of scalar moves and uxtb before the final smull. |
| define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: smull_zext_v4i16_v4i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr s0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_zext_v4i16_v4i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr s0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_zext_v4i16_v4i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr w8, [x0] |
| ; CHECK-GI-NEXT: fmov s0, w8 |
| ; CHECK-GI-NEXT: uxtb w8, w8 |
| ; CHECK-GI-NEXT: mov b1, v0.b[2] |
| ; CHECK-GI-NEXT: mov b2, v0.b[1] |
| ; CHECK-GI-NEXT: mov b3, v0.b[3] |
| ; CHECK-GI-NEXT: fmov s0, w8 |
| ; CHECK-GI-NEXT: fmov w9, s1 |
| ; CHECK-GI-NEXT: fmov w10, s2 |
| ; CHECK-GI-NEXT: fmov w11, s3 |
| ; CHECK-GI-NEXT: uxtb w9, w9 |
| ; CHECK-GI-NEXT: uxtb w10, w10 |
| ; CHECK-GI-NEXT: uxtb w11, w11 |
| ; CHECK-GI-NEXT: fmov s1, w9 |
| ; CHECK-GI-NEXT: mov v0.h[1], w10 |
| ; CHECK-GI-NEXT: mov v1.h[1], w11 |
| ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 |
| ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 |
| ; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: ret |
| %narrow.u = load <4 x i8>, ptr %A |
| %narrow.s = load <4 x i16>, ptr %B |
| %wide.u = zext <4 x i8> %narrow.u to <4 x i32> |
| %wide.s = sext <4 x i16> %narrow.s to <4 x i32> |
| %prod = mul <4 x i32> %wide.u, %wide.s |
| ret <4 x i32> %prod |
| } |
| |
| ; Mixed extension to i64 lanes: %A is a <2 x i16> that is zero-extended, %B a |
| ; <2 x i32> that is sign-extended. NEON and SVE expect two ldrh loads that are |
| ; assembled into a vector and narrowed with xtn before smull; the GlobalISel |
| ; run expects ld1/ldr loads and an and with a movi mask before smull. |
| define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldrh w8, [x0] |
| ; CHECK-NEON-NEXT: ldrh w9, [x0, #2] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: fmov d0, x8 |
| ; CHECK-NEON-NEXT: mov v0.d[1], x9 |
| ; CHECK-NEON-NEXT: xtn v0.2s, v0.2d |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldrh w8, [x0] |
| ; CHECK-SVE-NEXT: ldrh w9, [x0, #2] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: fmov d0, x8 |
| ; CHECK-SVE-NEXT: mov v0.d[1], x9 |
| ; CHECK-SVE-NEXT: xtn v0.2s, v0.2d |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_zext_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ld1 { v1.h }[0], [x0] |
| ; CHECK-GI-NEXT: ldr h2, [x0, #2] |
| ; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff |
| ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] |
| ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b |
| ; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] |
| ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] |
| ; CHECK-GI-NEXT: ldr d0, [x1] |
| ; CHECK-GI-NEXT: smull v0.2d, v1.2s, v0.2s |
| ; CHECK-GI-NEXT: ret |
| %narrow.u = load <2 x i16>, ptr %A |
| %narrow.s = load <2 x i32>, ptr %B |
| %wide.u = zext <2 x i16> %narrow.u to <2 x i64> |
| %wide.s = sext <2 x i32> %narrow.s to <2 x i64> |
| %prod = mul <2 x i64> %wide.u, %wide.s |
| ret <2 x i64> %prod |
| } |
| |
| ; The and with 0x7FFFFFFF clears the top bit of each i32 lane of %A before it |
| ; is zero-extended and multiplied by the sign-extended %B. All configurations |
| ; expect smull on the masked value; NEON/SVE mask with bic, GlobalISel with |
| ; mvni followed by and. |
| define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr d0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: bic v0.2s, #128, lsl #24 |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr d0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: bic v0.2s, #128, lsl #24 |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: mvni v0.2s, #128, lsl #24 |
| ; CHECK-GI-NEXT: ldr d1, [x0] |
| ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: ret |
| %raw.u = load <2 x i32>, ptr %A |
| %cleared.u = and <2 x i32> %raw.u, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF> |
| %raw.s = load <2 x i32>, ptr %B |
| %wide.u = zext <2 x i32> %cleared.u to <2 x i64> |
| %wide.s = sext <2 x i32> %raw.s to <2 x i64> |
| %prod = mul <2 x i64> %wide.u, %wide.s |
| ret <2 x i64> %prod |
| } |
| |
| ; Unsigned widening multiply: both <8 x i8> loads are zero-extended to i16 |
| ; lanes before the mul. Every run configuration expects a single umull. |
| define <8 x i16> @umull_v8i8_v8i16(ptr %A, ptr %B) nounwind { |
| ; CHECK-LABEL: umull_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr d0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEXT: ret |
| %lhs = load <8 x i8>, ptr %A |
| %rhs = load <8 x i8>, ptr %B |
| %lhs.wide = zext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = zext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| ret <8 x i16> %prod |
| } |
| |
| ; Unsigned widening multiply: both <4 x i16> loads are zero-extended to i32 |
| ; lanes before the mul. Every run configuration expects a single umull. |
| define <4 x i32> @umull_v4i16_v4i32(ptr %A, ptr %B) nounwind { |
| ; CHECK-LABEL: umull_v4i16_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr d0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEXT: ret |
| %lhs = load <4 x i16>, ptr %A |
| %rhs = load <4 x i16>, ptr %B |
| %lhs.wide = zext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = zext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| ret <4 x i32> %prod |
| } |
| |
| ; Unsigned widening multiply: both <2 x i32> loads are zero-extended to i64 |
| ; lanes before the mul. Every run configuration expects a single umull. |
| define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind { |
| ; CHECK-LABEL: umull_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr d0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: ret |
| %lhs = load <2 x i32>, ptr %A |
| %rhs = load <2 x i32>, ptr %B |
| %lhs.wide = zext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = zext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| ret <2 x i64> %prod |
| } |
| |
| ; Zero-extending multiply whose result is masked to the low 8 bits of each |
| ; i16 lane. NEON and SVE expect smull followed by bic; the GlobalISel run |
| ; expects umull followed by an and with a movi-materialized mask. |
| define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: amull_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr d0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr d0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr d1, [x0] |
| ; CHECK-GI-NEXT: ldr d2, [x1] |
| ; CHECK-GI-NEXT: movi v0.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: umull v1.8h, v1.8b, v2.8b |
| ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b |
| ; CHECK-GI-NEXT: ret |
| %lhs = load <8 x i8>, ptr %A |
| %rhs = load <8 x i8>, ptr %B |
| %lhs.wide = zext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = zext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| %low = and <8 x i16> %prod, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| ret <8 x i16> %low |
| } |
| |
| ; Zero-extending multiply whose result is masked to the low 16 bits of each |
| ; i32 lane. NEON and SVE expect smull, the GlobalISel run expects umull; all |
| ; three apply the mask with movi followed by and. |
| define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: amull_v4i16_v4i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr d1, [x0] |
| ; CHECK-NEON-NEXT: ldr d2, [x1] |
| ; CHECK-NEON-NEXT: movi v0.2d, #0x00ffff0000ffff |
| ; CHECK-NEON-NEXT: smull v1.4s, v1.4h, v2.4h |
| ; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull_v4i16_v4i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr d1, [x0] |
| ; CHECK-SVE-NEXT: ldr d2, [x1] |
| ; CHECK-SVE-NEXT: movi v0.2d, #0x00ffff0000ffff |
| ; CHECK-SVE-NEXT: smull v1.4s, v1.4h, v2.4h |
| ; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull_v4i16_v4i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr d1, [x0] |
| ; CHECK-GI-NEXT: ldr d2, [x1] |
| ; CHECK-GI-NEXT: movi v0.2d, #0x00ffff0000ffff |
| ; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h |
| ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b |
| ; CHECK-GI-NEXT: ret |
| %lhs = load <4 x i16>, ptr %A |
| %rhs = load <4 x i16>, ptr %B |
| %lhs.wide = zext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = zext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| %low = and <4 x i32> %prod, <i32 65535, i32 65535, i32 65535, i32 65535> |
| ret <4 x i32> %low |
| } |
| |
| ; Zero-extending multiply whose result is masked to the low 32 bits of each |
| ; i64 lane. NEON and SVE expect smull, the GlobalISel run expects umull; all |
| ; three apply the mask with movi followed by and. |
| define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind { |
| ; CHECK-NEON-LABEL: amull_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr d1, [x0] |
| ; CHECK-NEON-NEXT: ldr d2, [x1] |
| ; CHECK-NEON-NEXT: movi v0.2d, #0x000000ffffffff |
| ; CHECK-NEON-NEXT: smull v1.2d, v1.2s, v2.2s |
| ; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr d1, [x0] |
| ; CHECK-SVE-NEXT: ldr d2, [x1] |
| ; CHECK-SVE-NEXT: movi v0.2d, #0x000000ffffffff |
| ; CHECK-SVE-NEXT: smull v1.2d, v1.2s, v2.2s |
| ; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr d1, [x0] |
| ; CHECK-GI-NEXT: ldr d2, [x1] |
| ; CHECK-GI-NEXT: movi v0.2d, #0x000000ffffffff |
| ; CHECK-GI-NEXT: umull v1.2d, v1.2s, v2.2s |
| ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b |
| ; CHECK-GI-NEXT: ret |
| %lhs = load <2 x i32>, ptr %A |
| %rhs = load <2 x i32>, ptr %B |
| %lhs.wide = zext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = zext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| %low = and <2 x i64> %prod, <i64 4294967295, i64 4294967295> |
| ret <2 x i64> %low |
| } |
| |
| ; Signed widening multiply-accumulate: the product of the sign-extended |
| ; <8 x i8> values from %B and %C is added to the <8 x i16> accumulator loaded |
| ; from %A. Every run configuration expects a single smlal. |
| define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: smlal_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b |
| ; CHECK-NEXT: ret |
| %acc = load <8 x i16>, ptr %A |
| %lhs = load <8 x i8>, ptr %B |
| %rhs = load <8 x i8>, ptr %C |
| %lhs.wide = sext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = sext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| %sum = add <8 x i16> %acc, %prod |
| ret <8 x i16> %sum |
| } |
| |
| ; Signed widening multiply-accumulate: the product of the sign-extended |
| ; <4 x i16> values from %B and %C is added to the <4 x i32> accumulator loaded |
| ; from %A. Every run configuration expects a single smlal. |
| define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: smlal_v4i16_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h |
| ; CHECK-NEXT: ret |
| %acc = load <4 x i32>, ptr %A |
| %lhs = load <4 x i16>, ptr %B |
| %rhs = load <4 x i16>, ptr %C |
| %lhs.wide = sext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = sext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| %sum = add <4 x i32> %acc, %prod |
| ret <4 x i32> %sum |
| } |
| |
| ; Signed widening multiply-accumulate: the product of the sign-extended |
| ; <2 x i32> values from %B and %C is added to the <2 x i64> accumulator loaded |
| ; from %A. Every run configuration expects a single smlal. |
| define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: smlal_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-NEXT: ret |
| %acc = load <2 x i64>, ptr %A |
| %lhs = load <2 x i32>, ptr %B |
| %rhs = load <2 x i32>, ptr %C |
| %lhs.wide = sext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = sext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| %sum = add <2 x i64> %acc, %prod |
| ret <2 x i64> %sum |
| } |
| |
| ; Unsigned widening multiply-accumulate: the product of the zero-extended |
| ; <8 x i8> values from %B and %C is added to the <8 x i16> accumulator loaded |
| ; from %A. Every run configuration expects a single umlal. |
| define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: umlal_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b |
| ; CHECK-NEXT: ret |
| %acc = load <8 x i16>, ptr %A |
| %lhs = load <8 x i8>, ptr %B |
| %rhs = load <8 x i8>, ptr %C |
| %lhs.wide = zext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = zext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| %sum = add <8 x i16> %acc, %prod |
| ret <8 x i16> %sum |
| } |
| |
| ; Unsigned widening multiply-accumulate: the product of the zero-extended |
| ; <4 x i16> values from %B and %C is added to the <4 x i32> accumulator loaded |
| ; from %A. Every run configuration expects a single umlal. |
| define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: umlal_v4i16_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h |
| ; CHECK-NEXT: ret |
| %acc = load <4 x i32>, ptr %A |
| %lhs = load <4 x i16>, ptr %B |
| %rhs = load <4 x i16>, ptr %C |
| %lhs.wide = zext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = zext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| %sum = add <4 x i32> %acc, %prod |
| ret <4 x i32> %sum |
| } |
| |
| ; Unsigned widening multiply-accumulate: the product of the zero-extended |
| ; <2 x i32> values from %B and %C is added to the <2 x i64> accumulator loaded |
| ; from %A. Every run configuration expects a single umlal. |
| define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: umlal_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s |
| ; CHECK-NEXT: ret |
| %acc = load <2 x i64>, ptr %A |
| %lhs = load <2 x i32>, ptr %B |
| %rhs = load <2 x i32>, ptr %C |
| %lhs.wide = zext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = zext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| %sum = add <2 x i64> %acc, %prod |
| ret <2 x i64> %sum |
| } |
| |
| ; Zero-extending multiply-accumulate whose sum is masked to the low 8 bits of |
| ; each i16 lane. NEON and SVE expect smlal followed by bic; the GlobalISel run |
| ; expects umlal followed by an and with a movi-materialized mask. |
| define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-NEON-LABEL: amlal_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: ldr d2, [x2] |
| ; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b |
| ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amlal_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: ldr d2, [x2] |
| ; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b |
| ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amlal_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q0, [x0] |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: ldr d2, [x2] |
| ; CHECK-GI-NEXT: umlal v0.8h, v1.8b, v2.8b |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b |
| ; CHECK-GI-NEXT: ret |
| %acc = load <8 x i16>, ptr %A |
| %lhs = load <8 x i8>, ptr %B |
| %rhs = load <8 x i8>, ptr %C |
| %lhs.wide = zext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = zext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| %sum = add <8 x i16> %acc, %prod |
| %low = and <8 x i16> %sum, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| ret <8 x i16> %low |
| } |
| |
| ; Zero-extending multiply-accumulate whose sum is masked to the low 16 bits of |
| ; each i32 lane. NEON and SVE expect smlal, the GlobalISel run expects umlal; |
| ; all three apply the mask with movi followed by and. |
| define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-NEON-LABEL: amlal_v4i16_v4i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: ldr d2, [x2] |
| ; CHECK-NEON-NEXT: smlal v0.4s, v1.4h, v2.4h |
| ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff |
| ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amlal_v4i16_v4i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: ldr d2, [x2] |
| ; CHECK-SVE-NEXT: smlal v0.4s, v1.4h, v2.4h |
| ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff |
| ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amlal_v4i16_v4i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q0, [x0] |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff |
| ; CHECK-GI-NEXT: ldr d2, [x2] |
| ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b |
| ; CHECK-GI-NEXT: ret |
| %acc = load <4 x i32>, ptr %A |
| %lhs = load <4 x i16>, ptr %B |
| %rhs = load <4 x i16>, ptr %C |
| %lhs.wide = zext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = zext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| %sum = add <4 x i32> %acc, %prod |
| %low = and <4 x i32> %sum, <i32 65535, i32 65535, i32 65535, i32 65535> |
| ret <4 x i32> %low |
| } |
| |
| ; Zero-extending multiply-accumulate whose sum is masked to the low 32 bits of |
| ; each i64 lane. NEON and SVE expect smlal, the GlobalISel run expects umlal; |
| ; all three apply the mask with movi followed by and. |
| define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-NEON-LABEL: amlal_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: ldr d2, [x2] |
| ; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff |
| ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amlal_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: ldr d2, [x2] |
| ; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff |
| ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amlal_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q0, [x0] |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff |
| ; CHECK-GI-NEXT: ldr d2, [x2] |
| ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b |
| ; CHECK-GI-NEXT: ret |
| %acc = load <2 x i64>, ptr %A |
| %lhs = load <2 x i32>, ptr %B |
| %rhs = load <2 x i32>, ptr %C |
| %lhs.wide = zext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = zext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| %sum = add <2 x i64> %acc, %prod |
| %low = and <2 x i64> %sum, <i64 4294967295, i64 4294967295> |
| ret <2 x i64> %low |
| } |
| |
| ; Signed widening multiply-subtract: the product of the sign-extended |
| ; <8 x i8> values from %B and %C is subtracted from the <8 x i16> accumulator |
| ; loaded from %A. Every run configuration expects a single smlsl. |
| define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: smlsl_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b |
| ; CHECK-NEXT: ret |
| %acc = load <8 x i16>, ptr %A |
| %lhs = load <8 x i8>, ptr %B |
| %rhs = load <8 x i8>, ptr %C |
| %lhs.wide = sext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = sext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| %diff = sub <8 x i16> %acc, %prod |
| ret <8 x i16> %diff |
| } |
| |
| ; Signed widening multiply-subtract: the product of the sign-extended |
| ; <4 x i16> values from %B and %C is subtracted from the <4 x i32> accumulator |
| ; loaded from %A. Every run configuration expects a single smlsl. |
| define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: smlsl_v4i16_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h |
| ; CHECK-NEXT: ret |
| %acc = load <4 x i32>, ptr %A |
| %lhs = load <4 x i16>, ptr %B |
| %rhs = load <4 x i16>, ptr %C |
| %lhs.wide = sext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = sext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| %diff = sub <4 x i32> %acc, %prod |
| ret <4 x i32> %diff |
| } |
| |
| ; Signed widening multiply-subtract: the product of the sign-extended |
| ; <2 x i32> values from %B and %C is subtracted from the <2 x i64> accumulator |
| ; loaded from %A. Every run configuration expects a single smlsl. |
| define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: smlsl_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s |
| ; CHECK-NEXT: ret |
| %acc = load <2 x i64>, ptr %A |
| %lhs = load <2 x i32>, ptr %B |
| %rhs = load <2 x i32>, ptr %C |
| %lhs.wide = sext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = sext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| %diff = sub <2 x i64> %acc, %prod |
| ret <2 x i64> %diff |
| } |
| |
| ; Unsigned widening multiply-subtract: the product of the zero-extended |
| ; <8 x i8> values from %B and %C is subtracted from the <8 x i16> accumulator |
| ; loaded from %A. Every run configuration expects a single umlsl. |
| define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: umlsl_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b |
| ; CHECK-NEXT: ret |
| %acc = load <8 x i16>, ptr %A |
| %lhs = load <8 x i8>, ptr %B |
| %rhs = load <8 x i8>, ptr %C |
| %lhs.wide = zext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = zext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| %diff = sub <8 x i16> %acc, %prod |
| ret <8 x i16> %diff |
| } |
| |
| ; Unsigned widening multiply-subtract: the product of the zero-extended |
| ; <4 x i16> values from %B and %C is subtracted from the <4 x i32> accumulator |
| ; loaded from %A. Every run configuration expects a single umlsl. |
| define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: umlsl_v4i16_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h |
| ; CHECK-NEXT: ret |
| %acc = load <4 x i32>, ptr %A |
| %lhs = load <4 x i16>, ptr %B |
| %rhs = load <4 x i16>, ptr %C |
| %lhs.wide = zext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = zext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| %diff = sub <4 x i32> %acc, %prod |
| ret <4 x i32> %diff |
| } |
| |
| ; Unsigned widening multiply-subtract: the product of the zero-extended |
| ; <2 x i32> values from %B and %C is subtracted from the <2 x i64> accumulator |
| ; loaded from %A. Every run configuration expects a single umlsl. |
| define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-LABEL: umlsl_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: ldr d1, [x1] |
| ; CHECK-NEXT: ldr d2, [x2] |
| ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s |
| ; CHECK-NEXT: ret |
| %acc = load <2 x i64>, ptr %A |
| %lhs = load <2 x i32>, ptr %B |
| %rhs = load <2 x i32>, ptr %C |
| %lhs.wide = zext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = zext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| %diff = sub <2 x i64> %acc, %prod |
| ret <2 x i64> %diff |
| } |
| |
| ; Zero-extending multiply-subtract whose difference is masked to the low 8 |
| ; bits of each i16 lane. NEON and SVE expect smlsl followed by bic; the |
| ; GlobalISel run expects umlsl followed by an and with a movi-materialized mask. |
| define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-NEON-LABEL: amlsl_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: ldr d2, [x2] |
| ; CHECK-NEON-NEXT: smlsl v0.8h, v1.8b, v2.8b |
| ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amlsl_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: ldr d2, [x2] |
| ; CHECK-SVE-NEXT: smlsl v0.8h, v1.8b, v2.8b |
| ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amlsl_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q0, [x0] |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: ldr d2, [x2] |
| ; CHECK-GI-NEXT: umlsl v0.8h, v1.8b, v2.8b |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b |
| ; CHECK-GI-NEXT: ret |
| %acc = load <8 x i16>, ptr %A |
| %lhs = load <8 x i8>, ptr %B |
| %rhs = load <8 x i8>, ptr %C |
| %lhs.wide = zext <8 x i8> %lhs to <8 x i16> |
| %rhs.wide = zext <8 x i8> %rhs to <8 x i16> |
| %prod = mul <8 x i16> %lhs.wide, %rhs.wide |
| %diff = sub <8 x i16> %acc, %prod |
| %low = and <8 x i16> %diff, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| ret <8 x i16> %low |
| } |
| |
| ; Zero-extending multiply-subtract whose difference is masked to the low 16 |
| ; bits of each i32 lane. NEON and SVE expect smlsl, the GlobalISel run expects |
| ; umlsl; all three apply the mask with movi followed by and. |
| define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-NEON-LABEL: amlsl_v4i16_v4i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: ldr d2, [x2] |
| ; CHECK-NEON-NEXT: smlsl v0.4s, v1.4h, v2.4h |
| ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff |
| ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amlsl_v4i16_v4i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: ldr d2, [x2] |
| ; CHECK-SVE-NEXT: smlsl v0.4s, v1.4h, v2.4h |
| ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff |
| ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amlsl_v4i16_v4i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q0, [x0] |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff |
| ; CHECK-GI-NEXT: ldr d2, [x2] |
| ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b |
| ; CHECK-GI-NEXT: ret |
| %acc = load <4 x i32>, ptr %A |
| %lhs = load <4 x i16>, ptr %B |
| %rhs = load <4 x i16>, ptr %C |
| %lhs.wide = zext <4 x i16> %lhs to <4 x i32> |
| %rhs.wide = zext <4 x i16> %rhs to <4 x i32> |
| %prod = mul <4 x i32> %lhs.wide, %rhs.wide |
| %diff = sub <4 x i32> %acc, %prod |
| %low = and <4 x i32> %diff, <i32 65535, i32 65535, i32 65535, i32 65535> |
| ret <4 x i32> %low |
| } |
| |
| ; Zero-extending multiply-subtract whose difference is masked to the low 32 |
| ; bits of each i64 lane. NEON and SVE expect smlsl, the GlobalISel run expects |
| ; umlsl; all three apply the mask with movi followed by and. |
| define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { |
| ; CHECK-NEON-LABEL: amlsl_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q0, [x0] |
| ; CHECK-NEON-NEXT: ldr d1, [x1] |
| ; CHECK-NEON-NEXT: ldr d2, [x2] |
| ; CHECK-NEON-NEXT: smlsl v0.2d, v1.2s, v2.2s |
| ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff |
| ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amlsl_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q0, [x0] |
| ; CHECK-SVE-NEXT: ldr d1, [x1] |
| ; CHECK-SVE-NEXT: ldr d2, [x2] |
| ; CHECK-SVE-NEXT: smlsl v0.2d, v1.2s, v2.2s |
| ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff |
| ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amlsl_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q0, [x0] |
| ; CHECK-GI-NEXT: ldr d1, [x1] |
| ; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff |
| ; CHECK-GI-NEXT: ldr d2, [x2] |
| ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b |
| ; CHECK-GI-NEXT: ret |
| %acc = load <2 x i64>, ptr %A |
| %lhs = load <2 x i32>, ptr %B |
| %rhs = load <2 x i32>, ptr %C |
| %lhs.wide = zext <2 x i32> %lhs to <2 x i64> |
| %rhs.wide = zext <2 x i32> %rhs to <2 x i64> |
| %prod = mul <2 x i64> %lhs.wide, %rhs.wide |
| %diff = sub <2 x i64> %acc, %prod |
| %low = and <2 x i64> %diff, <i64 4294967295, i64 4294967295> |
| ret <2 x i64> %low |
| } |
| |
| ; SMULL/UMULL recognizing BUILD_VECTORs with sign/zero-extended elements. |
| ; sext(<8 x i8>) multiplied by a splat of -12. The constant fits in i8, so all |
| ; runs expect it to be materialised as a byte splat (movi #244, i.e. -12 as an |
| ; 8-bit value) and fed straight into smull. |
| define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { |
| ; CHECK-LABEL: smull_extvec_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: movi v1.8b, #244 |
| ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEXT: ret |
| %tmp3 = sext <8 x i8> %arg to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12> |
| ret <8 x i16> %tmp4 |
| } |
| |
| ; Negative counterpart of smull_extvec_v8i8_v8i16: the splat constant is -999, |
| ; which is outside the i8 range. Every run expects an explicit sshll followed |
| ; by a full-width mul instead of smull. |
| define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind { |
| ; Do not use SMULL if the BUILD_VECTOR element values are too big. |
| ; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: mov w8, #64537 // =0xfc19 |
| ; CHECK-NEON-NEXT: sshll v0.8h, v0.8b, #0 |
| ; CHECK-NEON-NEXT: dup v1.8h, w8 |
| ; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: mov w8, #64537 // =0xfc19 |
| ; CHECK-SVE-NEXT: sshll v0.8h, v0.8b, #0 |
| ; CHECK-SVE-NEXT: dup v1.8h, w8 |
| ; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: adrp x8, .LCPI34_0 |
| ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 |
| ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_0] |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = sext <8 x i8> %arg to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999> |
| ret <8 x i16> %tmp4 |
| } |
| |
| ; sext(<4 x i16>) multiplied by a splat of -12. The constant fits in i16, so all |
| ; runs expect a halfword splat (mvni #11, the bitwise NOT of 11, i.e. -12) |
| ; used directly as the second smull operand. |
| define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { |
| ; CHECK-LABEL: smull_extvec_v4i16_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mvni v1.4h, #11 |
| ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEXT: ret |
| %tmp3 = sext <4 x i16> %arg to <4 x i32> |
| %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12> |
| ret <4 x i32> %tmp4 |
| } |
| |
| ; sext(<2 x i32>) multiplied by a splat of -1234. All runs expect smull; they |
| ; differ only in how the narrow constant is produced: mov + dup for the |
| ; +neon/+sve runs, an adrp/ldr load of .LCPI36_0 for the -global-isel run. |
| define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { |
| ; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: mov w8, #-1234 // =0xfffffb2e |
| ; CHECK-NEON-NEXT: dup v1.2s, w8 |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: mov w8, #-1234 // =0xfffffb2e |
| ; CHECK-SVE-NEXT: dup v1.2s, w8 |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: adrp x8, .LCPI36_0 |
| ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI36_0] |
| ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = sext <2 x i32> %arg to <2 x i64> |
| %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234> |
| ret <2 x i64> %tmp4 |
| } |
| |
| ; zext(<8 x i8>) multiplied by a splat of 12. The constant fits in 8 bits, so |
| ; all runs expect a byte splat (movi #12) used directly as a umull operand. |
| define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { |
| ; CHECK-LABEL: umull_extvec_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: movi v1.8b, #12 |
| ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEXT: ret |
| %tmp3 = zext <8 x i8> %arg to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12> |
| ret <8 x i16> %tmp4 |
| } |
| |
| ; Negative counterpart of umull_extvec_v8i8_v8i16: the splat constant is 999, |
| ; which does not fit in 8 bits. Every run expects an explicit ushll followed by |
| ; a full-width mul instead of umull. |
| define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind { |
| ; Do not use UMULL if the BUILD_VECTOR element values are too big. |
| ; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: mov w8, #999 // =0x3e7 |
| ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-NEON-NEXT: dup v1.8h, w8 |
| ; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: mov w8, #999 // =0x3e7 |
| ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-SVE-NEXT: dup v1.8h, w8 |
| ; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: adrp x8, .LCPI38_0 |
| ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_0] |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = zext <8 x i8> %arg to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999> |
| ret <8 x i16> %tmp4 |
| } |
| |
| ; zext(<4 x i16>) multiplied by a splat of 1234. All runs expect umull on a |
| ; halfword constant vector; the +neon/+sve runs build it with mov + dup, the |
| ; -global-isel run loads it from .LCPI39_0. |
| define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { |
| ; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-NEON-NEXT: dup v1.4h, w8 |
| ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-SVE-NEXT: dup v1.4h, w8 |
| ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: adrp x8, .LCPI39_0 |
| ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI39_0] |
| ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = zext <4 x i16> %arg to <4 x i32> |
| %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234> |
| ret <4 x i32> %tmp4 |
| } |
| |
| ; zext(<2 x i32>) multiplied by a splat of 1234. All runs expect umull on a |
| ; word constant vector; the +neon/+sve runs build it with mov + dup, the |
| ; -global-isel run loads it from .LCPI40_0. |
| define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { |
| ; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-NEON-NEXT: dup v1.2s, w8 |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-SVE-NEXT: dup v1.2s, w8 |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: adrp x8, .LCPI40_0 |
| ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI40_0] |
| ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = zext <2 x i32> %arg to <2 x i64> |
| %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234> |
| ret <2 x i64> %tmp4 |
| } |
| |
| ; (zext(<8 x i8>) * splat(12)) & 255. Only the low 8 bits of each 16-bit lane |
| ; are kept, and those bits are the same for a signed and an unsigned widening |
| ; multiply. The +neon/+sve runs expect smull + bic, the -global-isel run |
| ; expects umull + and. |
| define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { |
| ; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: movi v1.8b, #12 |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: movi v1.8b, #12 |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v1.8b, #12 |
| ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = zext <8 x i8> %arg to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12> |
| %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| ret <8 x i16> %and |
| } |
| |
| ; (zext(<4 x i16>) * splat(1234)) & 65535. Only the low 16 bits of each 32-bit |
| ; lane are kept, so a signed or an unsigned widening multiply gives the same |
| ; result. The +neon/+sve runs expect smull, the -global-isel run expects umull |
| ; with the constant loaded from .LCPI42_0; both finish with the AND mask. |
| define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { |
| ; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-NEON-NEXT: dup v1.4h, w8 |
| ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff |
| ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-SVE-NEXT: dup v1.4h, w8 |
| ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff |
| ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: adrp x8, .LCPI42_0 |
| ; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff |
| ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI42_0] |
| ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = zext <4 x i16> %arg to <4 x i32> |
| %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234> |
| %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535> |
| ret <4 x i32> %and |
| } |
| |
| ; (zext(<2 x i32>) * splat(1234)) & 0xffffffff. Only the low 32 bits of each |
| ; 64-bit lane are kept, so a signed or an unsigned widening multiply gives the |
| ; same result. The +neon/+sve runs expect smull, the -global-isel run expects |
| ; umull with the constant loaded from .LCPI43_0; both finish with the AND mask. |
| define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { |
| ; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-NEON-NEXT: dup v1.2s, w8 |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff |
| ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 |
| ; CHECK-SVE-NEXT: dup v1.2s, w8 |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff |
| ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: adrp x8, .LCPI43_0 |
| ; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff |
| ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI43_0] |
| ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %tmp3 = zext <2 x i32> %arg to <2 x i64> |
| %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234> |
| %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295> |
| ret <2 x i64> %and |
| } |
| |
| ; Multiplies sext(%x) by zext(%y) and returns lane 0 of the product. |
| define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) { |
| ; If one operand has a zero-extend and the other a sign-extend, smull |
| ; cannot be used. |
| ; All runs therefore expect separate sshll/ushll extends feeding a plain mul, |
| ; followed by a umov of halfword lane 0 into w0. |
| ; CHECK-LABEL: smullWithInconsistentExtensions: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 |
| ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 |
| ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: umov w0, v0.h[0] |
| ; CHECK-NEXT: ret |
| %s = sext <8 x i8> %x to <8 x i16> |
| %z = zext <8 x i8> %y to <8 x i16> |
| %m = mul <8 x i16> %s, %z |
| %r = extractelement <8 x i16> %m, i32 0 |
| ret i16 %r |
| } |
| |
| ; trunc((sext(<8 x i16>) * splat(35584)) >> 1) back to <8 x i16>. The constant |
| ; 35584 (139 << 8) is larger than the signed i16 maximum, and all runs expect |
| ; sshll/sshll2 + mul on 32-bit lanes rather than smull, followed by |
| ; shrn/shrn2 #1 for the shift-and-truncate. |
| ; NOTE(review): presumably the out-of-range constant is what blocks smull here. |
| define <8 x i16> @smull_extended_vector_operand(<8 x i16> %v) { |
| ; CHECK-LABEL: smull_extended_vector_operand: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: movi v1.4s, #139, lsl #8 |
| ; CHECK-NEXT: sshll v2.4s, v0.4h, #0 |
| ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 |
| ; CHECK-NEXT: mul v2.4s, v2.4s, v1.4s |
| ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s |
| ; CHECK-NEXT: shrn v0.4h, v2.4s, #1 |
| ; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = sext <8 x i16> %v to <8 x i32> |
| %1 = mul <8 x i32> %0, <i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584> |
| %2 = lshr <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| %3 = trunc <8 x i32> %2 to <8 x i16> |
| ret <8 x i16> %3 |
| |
| } |
| |
| ; Stores (zext(high half of *src) + zext(low half of *src)) * zext(splat(trunc |
| ; %mul to i8)) to *dst. The +neon/+sve runs expect the multiply to be |
| ; distributed over the add as umull (high half) + umlal (low half); the |
| ; -global-isel run expects the add to stay as uaddw2 followed by one mul. |
| define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind { |
| ; CHECK-NEON-LABEL: distribute: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: ldr q0, [x1] |
| ; CHECK-NEON-NEXT: dup v1.8b, w2 |
| ; CHECK-NEON-NEXT: mov d2, v0.d[1] |
| ; CHECK-NEON-NEXT: umull v2.8h, v2.8b, v1.8b |
| ; CHECK-NEON-NEXT: umlal v2.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: str q2, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: distribute: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: ldr q0, [x1] |
| ; CHECK-SVE-NEXT: dup v1.8b, w2 |
| ; CHECK-SVE-NEXT: mov d2, v0.d[1] |
| ; CHECK-SVE-NEXT: umull v2.8h, v2.8b, v1.8b |
| ; CHECK-SVE-NEXT: umlal v2.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: str q2, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: distribute: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: ldr q0, [x1] |
| ; CHECK-GI-NEXT: dup v1.8b, w2 |
| ; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 |
| ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 |
| ; CHECK-GI-NEXT: uaddw2 v0.8h, v2.8h, v0.16b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: str q0, [x0] |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %0 = trunc i32 %mul to i8 |
| %1 = insertelement <8 x i8> undef, i8 %0, i32 0 |
| %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer |
| %3 = load <16 x i8>, ptr %src, align 1 |
| %4 = bitcast <16 x i8> %3 to <2 x double> |
| %5 = extractelement <2 x double> %4, i32 1 |
| %6 = bitcast double %5 to <8 x i8> |
| %7 = zext <8 x i8> %6 to <8 x i16> |
| %8 = zext <8 x i8> %2 to <8 x i16> |
| %9 = extractelement <2 x double> %4, i32 0 |
| %10 = bitcast double %9 to <8 x i8> |
| %11 = zext <8 x i8> %10 to <8 x i16> |
| %12 = add <8 x i16> %7, %11 |
| %13 = mul <8 x i16> %12, %8 |
| store <8 x i16> %13, ptr %dst, align 2 |
| ret void |
| } |
| |
| ; Unsigned widening multiply of two full <16 x i8> vectors into <16 x i16>. |
| ; Every run expects one umull (low halves) and one umull2 (high halves); the |
| ; runs differ only in instruction order and register assignment. |
| define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { |
| ; CHECK-NEON-LABEL: umull2_i8: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: umull2 v2.8h, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: mov v1.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull2_i8: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: umull2 v2.8h, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: mov v1.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull2_i8: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = zext <16 x i8> %arg1 to <16 x i16> |
| %arg2_ext = zext <16 x i8> %arg2 to <16 x i16> |
| %mul = mul <16 x i16> %arg1_ext, %arg2_ext |
| ret <16 x i16> %mul |
| } |
| |
| ; Signed widening multiply of two full <16 x i8> vectors into <16 x i16>. |
| ; Every run expects one smull (low halves) and one smull2 (high halves); the |
| ; runs differ only in instruction order and register assignment. |
| define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { |
| ; CHECK-NEON-LABEL: smull2_i8: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: smull2 v2.8h, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: mov v1.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull2_i8: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: smull2 v2.8h, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: mov v1.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull2_i8: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: smull v2.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: smull2 v1.8h, v0.16b, v1.16b |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = sext <16 x i8> %arg1 to <16 x i16> |
| %arg2_ext = sext <16 x i8> %arg2 to <16 x i16> |
| %mul = mul <16 x i16> %arg1_ext, %arg2_ext |
| ret <16 x i16> %mul |
| } |
| |
| ; Unsigned widening multiply of two full <8 x i16> vectors into <8 x i32>. |
| ; Every run expects one umull (low halves) and one umull2 (high halves); the |
| ; runs differ only in instruction order and register assignment. |
| define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { |
| ; CHECK-NEON-LABEL: umull2_i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: umull2 v2.4s, v0.8h, v1.8h |
| ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEON-NEXT: mov v1.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull2_i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: umull2 v2.4s, v0.8h, v1.8h |
| ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-SVE-NEXT: mov v1.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull2_i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = zext <8 x i16> %arg1 to <8 x i32> |
| %arg2_ext = zext <8 x i16> %arg2 to <8 x i32> |
| %mul = mul <8 x i32> %arg1_ext, %arg2_ext |
| ret <8 x i32> %mul |
| } |
| |
| ; Signed widening multiply of two full <8 x i16> vectors into <8 x i32>. |
| ; Every run expects one smull (low halves) and one smull2 (high halves); the |
| ; runs differ only in instruction order and register assignment. |
| define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { |
| ; CHECK-NEON-LABEL: smull2_i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: smull2 v2.4s, v0.8h, v1.8h |
| ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEON-NEXT: mov v1.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull2_i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: smull2 v2.4s, v0.8h, v1.8h |
| ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h |
| ; CHECK-SVE-NEXT: mov v1.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull2_i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: smull2 v1.4s, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = sext <8 x i16> %arg1 to <8 x i32> |
| %arg2_ext = sext <8 x i16> %arg2 to <8 x i32> |
| %mul = mul <8 x i32> %arg1_ext, %arg2_ext |
| ret <8 x i32> %mul |
| } |
| |
| ; Unsigned widening multiply of two full <4 x i32> vectors into <4 x i64>. |
| ; Every run expects one umull (low halves) and one umull2 (high halves); the |
| ; runs differ only in instruction order and register assignment. |
| define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { |
| ; CHECK-NEON-LABEL: umull2_i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: umull2 v2.2d, v0.4s, v1.4s |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: mov v1.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull2_i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: umull2 v2.2d, v0.4s, v1.4s |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: mov v1.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull2_i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = zext <4 x i32> %arg1 to <4 x i64> |
| %arg2_ext = zext <4 x i32> %arg2 to <4 x i64> |
| %mul = mul <4 x i64> %arg1_ext, %arg2_ext |
| ret <4 x i64> %mul |
| } |
| |
| ; Signed widening multiply of two full <4 x i32> vectors into <4 x i64>. |
| ; Every run expects one smull (low halves) and one smull2 (high halves); the |
| ; runs differ only in instruction order and register assignment. |
| define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { |
| ; CHECK-NEON-LABEL: smull2_i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: smull2 v2.2d, v0.4s, v1.4s |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: mov v1.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smull2_i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: smull2 v2.2d, v0.4s, v1.4s |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: mov v1.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smull2_i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: smull v2.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: smull2 v1.2d, v0.4s, v1.4s |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = sext <4 x i32> %arg1 to <4 x i64> |
| %arg2_ext = sext <4 x i32> %arg2 to <4 x i64> |
| %mul = mul <4 x i64> %arg1_ext, %arg2_ext |
| ret <4 x i64> %mul |
| } |
| |
| ; (zext(<16 x i8>) * zext(<16 x i8>)) & 255 on <16 x i16>. Only the low 8 bits |
| ; of each lane are kept, so a signed or an unsigned widening multiply gives the |
| ; same result. The +neon/+sve runs expect smull/smull2 + bic, the -global-isel |
| ; run expects umull/umull2 + and. |
| define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { |
| ; CHECK-NEON-LABEL: amull2_i8: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: smull v2.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: smull2 v1.8h, v0.16b, v1.16b |
| ; CHECK-NEON-NEXT: bic v2.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull2_i8: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: smull v2.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: smull2 v1.8h, v0.16b, v1.16b |
| ; CHECK-SVE-NEXT: bic v2.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: mov v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull2_i8: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: umull v3.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b |
| ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = zext <16 x i8> %arg1 to <16 x i16> |
| %arg2_ext = zext <16 x i8> %arg2 to <16 x i16> |
| %mul = mul <16 x i16> %arg1_ext, %arg2_ext |
| %and = and <16 x i16> %mul, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| ret <16 x i16> %and |
| } |
| |
| ; (zext(<8 x i16>) * zext(<8 x i16>)) & 65535 on <8 x i32>. Only the low 16 |
| ; bits of each lane are kept, so a signed or an unsigned widening multiply |
| ; gives the same result. The +neon/+sve runs expect smull/smull2, the |
| ; -global-isel run expects umull/umull2; both apply the AND mask to each half. |
| define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { |
| ; CHECK-NEON-LABEL: amull2_i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: movi v2.2d, #0x00ffff0000ffff |
| ; CHECK-NEON-NEXT: smull v3.4s, v0.4h, v1.4h |
| ; CHECK-NEON-NEXT: smull2 v0.4s, v0.8h, v1.8h |
| ; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull2_i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: movi v2.2d, #0x00ffff0000ffff |
| ; CHECK-SVE-NEXT: smull v3.4s, v0.4h, v1.4h |
| ; CHECK-SVE-NEXT: smull2 v0.4s, v0.8h, v1.8h |
| ; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull2_i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff |
| ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = zext <8 x i16> %arg1 to <8 x i32> |
| %arg2_ext = zext <8 x i16> %arg2 to <8 x i32> |
| %mul = mul <8 x i32> %arg1_ext, %arg2_ext |
| %and = and <8 x i32> %mul, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> |
| ret <8 x i32> %and |
| } |
| |
| ; (zext(<4 x i32>) * zext(<4 x i32>)) & 0xffffffff on <4 x i64>. Only the low |
| ; 32 bits of each lane are kept, so a signed or an unsigned widening multiply |
| ; gives the same result. The +neon/+sve runs expect smull/smull2, the |
| ; -global-isel run expects umull/umull2; both apply the AND mask to each half. |
| define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { |
| ; CHECK-NEON-LABEL: amull2_i32: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: movi v2.2d, #0x000000ffffffff |
| ; CHECK-NEON-NEXT: smull v3.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: smull2 v0.2d, v0.4s, v1.4s |
| ; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: amull2_i32: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: movi v2.2d, #0x000000ffffffff |
| ; CHECK-SVE-NEXT: smull v3.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: smull2 v0.2d, v0.4s, v1.4s |
| ; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: amull2_i32: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff |
| ; CHECK-GI-NEXT: umull v3.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s |
| ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %arg1_ext = zext <4 x i32> %arg1 to <4 x i64> |
| %arg2_ext = zext <4 x i32> %arg2 to <4 x i64> |
| %mul = mul <4 x i64> %arg1_ext, %arg2_ext |
| %and = and <4 x i64> %mul, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> |
| ret <4 x i64> %and |
| } |
| |
| |
| ; zext(<8 x i8>) multiplied by (<8 x i16> & 255). The mask leaves only the low |
| ; 8 bits of each lane of the second operand, and every run expects that |
| ; operand to be masked, narrowed with xtn and used as a umull input. The mask |
| ; is a bic immediate for +neon/+sve and a movi + and for -global-isel. |
| define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) { |
| ; CHECK-NEON-LABEL: umull_and_v8i16: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_and_v8i16: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_and_v8i16: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-GI-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <8 x i8> %src1 to <8 x i16> |
| %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| %out = mul nsw <8 x i16> %in1, %in2 |
| ret <8 x i16> %out |
| } |
| |
| ; Commuted form of umull_and_v8i16: the masked operand is the first operand of |
| ; the mul. Every run still expects xtn + umull, with the narrowed masked value |
| ; as the first umull source. |
| define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) { |
| ; CHECK-NEON-LABEL: umull_and_v8i16_c: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-NEON-NEXT: umull v0.8h, v1.8b, v0.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_and_v8i16_c: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-SVE-NEXT: umull v0.8h, v1.8b, v0.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_and_v8i16_c: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-GI-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-GI-NEXT: umull v0.8h, v1.8b, v0.8b |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <8 x i8> %src1 to <8 x i16> |
| %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| %out = mul nsw <8 x i16> %in2, %in1 |
| ret <8 x i16> %out |
| } |
| |
| ; Negative counterpart of umull_and_v8i16: the mask is 256, so the surviving |
| ; bit of the second operand is bit 8 and the value does not fit in 8 bits. All |
| ; runs expect ushll + and + a full-width mul instead of umull. |
| define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) { |
| ; CHECK-LABEL: umull_and256_v8i16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: movi v2.8h, #1, lsl #8 |
| ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: ret |
| entry: |
| %in1 = zext <8 x i8> %src1 to <8 x i16> |
| %in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256> |
| %out = mul nsw <8 x i16> %in1, %in2 |
| ret <8 x i16> %out |
| } |
| |
| ; zext(<8 x i8>) multiplied by a splat of 255 (%src2 is not used by the body). |
| ; 255 is the largest value that fits in 8 bits; every run expects an all-ones |
| ; vector (255 in each byte lane) as the second umull operand. |
| define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) { |
| ; CHECK-NEON-LABEL: umull_andconst_v8i16: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v1.2d, #0xffffffffffffffff |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_andconst_v8i16: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v1.2d, #0xffffffffffffffff |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_andconst_v8i16: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi d1, #0xffffffffffffffff |
| ; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <8 x i8> %src1 to <8 x i16> |
| %out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| ret <8 x i16> %out |
| } |
| |
| ; Like umull_and_v8i16, but the first operand is zero-extended from <8 x i4>. |
| ; The +neon/+sve runs expect the i4 lanes to be masked with #15 at byte width |
| ; and then multiplied with umull; the -global-isel run expects ushll, two |
| ; 16-bit-lane AND masks and a full-width mul. |
| define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) { |
| ; CHECK-NEON-LABEL: umull_smaller_v8i16: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.8b, #15 |
| ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-NEON-NEXT: and v0.8b, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_smaller_v8i16: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.8b, #15 |
| ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 |
| ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h |
| ; CHECK-SVE-NEXT: and v0.8b, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_smaller_v8i16: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff |
| ; CHECK-GI-NEXT: movi v3.8h, #15 |
| ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 |
| ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <8 x i4> %src1 to <8 x i16> |
| %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> |
| %out = mul nsw <8 x i16> %in1, %in2 |
| ret <8 x i16> %out |
| } |
| |
| ; zext(<4 x i16>) multiplied by (<4 x i32> & 255). All runs expect the masked |
| ; operand to be narrowed with xtn to halfwords and multiplied with umull. |
| define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) { |
| ; CHECK-LABEL: umull_and_v4i32: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff |
| ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-NEXT: xtn v1.4h, v1.4s |
| ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-NEXT: ret |
| entry: |
| %in1 = zext <4 x i16> %src1 to <4 x i32> |
| %in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255> |
| %out = mul nsw <4 x i32> %in1, %in2 |
| ret <4 x i32> %out |
| } |
| |
| ; 256-bit variant of umull_and_v4i32: zext(<8 x i16>) multiplied by |
| ; (<8 x i32> & 255). The +neon/+sve runs expect both masked halves to be packed |
| ; with a single uzp1 and multiplied with umull + umull2; the -global-isel run |
| ; expects two xtn narrows and two umull instructions on extracted halves. |
| define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) { |
| ; CHECK-NEON-LABEL: umull_and_v8i32: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v3.2d, #0x0000ff000000ff |
| ; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b |
| ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b |
| ; CHECK-NEON-NEXT: uzp1 v2.8h, v1.8h, v2.8h |
| ; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_and_v8i32: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v3.2d, #0x0000ff000000ff |
| ; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b |
| ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b |
| ; CHECK-SVE-NEXT: uzp1 v2.8h, v1.8h, v2.8h |
| ; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_and_v8i32: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b |
| ; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b |
| ; CHECK-GI-NEXT: mov d3, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v1.4h, v1.4s |
| ; CHECK-GI-NEXT: xtn v2.4h, v2.4s |
| ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: umull v1.4s, v3.4h, v2.4h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <8 x i16> %src1 to <8 x i32> |
| %in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> |
| %out = mul nsw <8 x i32> %in1, %in2 |
| ret <8 x i32> %out |
| } |
| |
| ; zext(<8 x i16>) multiplied by a splat of the scalar (%src2 & 255). The |
| ; +neon/+sve runs expect the masked scalar to be duplicated directly into |
| ; halfword lanes and used by umull + umull2; the -global-isel run expects a |
| ; word-lane dup narrowed with xtn and two umull instructions. |
| define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) { |
| ; CHECK-NEON-LABEL: umull_and_v8i32_dup: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: and w8, w0, #0xff |
| ; CHECK-NEON-NEXT: dup v2.8h, w8 |
| ; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_and_v8i32_dup: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: and w8, w0, #0xff |
| ; CHECK-SVE-NEXT: dup v2.8h, w8 |
| ; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_and_v8i32_dup: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: and w8, w0, #0xff |
| ; CHECK-GI-NEXT: mov d2, v0.d[1] |
| ; CHECK-GI-NEXT: dup v1.4s, w8 |
| ; CHECK-GI-NEXT: xtn v1.4h, v1.4s |
| ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h |
| ; CHECK-GI-NEXT: umull v1.4s, v2.4h, v1.4h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <8 x i16> %src1 to <8 x i32> |
| %in2 = and i32 %src2, 255 |
| %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %in2, i64 0 |
| %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer |
| %out = mul nsw <8 x i32> %in1, %broadcast.splat |
| ret <8 x i32> %out |
| } |
| |
| ; Multiplies a zero-extended <2 x i32> by (%src2 & 255) in <2 x i64>. All three |
| ; runs share one expected sequence: materialise the 0xff mask, and, narrow the |
| ; masked operand to .2s with xtn, then a single umull. |
| define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) { |
| ; CHECK-LABEL: umull_and_v2i64: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: movi v2.2d, #0x000000000000ff |
| ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b |
| ; CHECK-NEXT: xtn v1.2s, v1.2d |
| ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: ret |
| entry: |
| %in1 = zext <2 x i32> %src1 to <2 x i64> |
| %in2 = and <2 x i64> %src2, <i64 255, i64 255> |
| %out = mul nsw <2 x i64> %in1, %in2 |
| ret <2 x i64> %out |
| } |
| |
| ; <4 x i64> variant: a zero-extended <4 x i32> times (%src2 & 255). The wide |
| ; operand arrives in two q registers (v1, v2). The NEON and SVE runs expect both |
| ; halves masked, packed into one .4s vector with uzp1, then umull2/umull; the |
| ; GlobalISel run expects each half narrowed with xtn, the high half of %src1 |
| ; extracted with a d-register mov, and two plain umull. |
| define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) { |
| ; CHECK-NEON-LABEL: umull_and_v4i64: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v3.2d, #0x000000000000ff |
| ; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b |
| ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b |
| ; CHECK-NEON-NEXT: uzp1 v2.4s, v1.4s, v2.4s |
| ; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_and_v4i64: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v3.2d, #0x000000000000ff |
| ; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b |
| ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b |
| ; CHECK-SVE-NEXT: uzp1 v2.4s, v1.4s, v2.4s |
| ; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_and_v4i64: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v3.2d, #0x000000000000ff |
| ; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b |
| ; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b |
| ; CHECK-GI-NEXT: mov d3, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v1.2s, v1.2d |
| ; CHECK-GI-NEXT: xtn v2.2s, v2.2d |
| ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: umull v1.2d, v3.2s, v2.2s |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <4 x i32> %src1 to <4 x i64> |
| %in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255> |
| %out = mul nsw <4 x i64> %in1, %in2 |
| ret <4 x i64> %out |
| } |
| |
| ; Multiplies a zero-extended <4 x i32> by a splat of the i64 scalar |
| ; (%src2 & 255). The NEON and SVE runs expect the mask done on the 32-bit view |
| ; of the argument register (w0), a .4s dup and umull2/umull; the GlobalISel run |
| ; expects the mask on x0, a .2d dup narrowed with xtn, and two plain umull. |
| ; NOTE(review): the splat is built from undef operands; LLVM discourages undef |
| ; in favour of poison - consider migrating and regenerating the assertions. |
| define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) { |
| ; CHECK-NEON-LABEL: umull_and_v4i64_dup: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: and w8, w0, #0xff |
| ; CHECK-NEON-NEXT: dup v2.4s, w8 |
| ; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umull_and_v4i64_dup: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: and w8, w0, #0xff |
| ; CHECK-SVE-NEXT: dup v2.4s, w8 |
| ; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umull_and_v4i64_dup: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: and x8, x0, #0xff |
| ; CHECK-GI-NEXT: mov d2, v0.d[1] |
| ; CHECK-GI-NEXT: dup v1.2d, x8 |
| ; CHECK-GI-NEXT: xtn v1.2s, v1.2d |
| ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: umull v1.2d, v2.2s, v1.2s |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %in1 = zext <4 x i32> %src1 to <4 x i64> |
| %in2 = and i64 %src2, 255 |
| ; Broadcast the masked scalar to all four lanes (insert lane 0, then shuffle |
| ; with an all-zero mask). |
| %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %in2, i64 0 |
| %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer |
| %out = mul nsw <4 x i64> %in1, %broadcast.splat |
| ret <4 x i64> %out |
| } |
| |
| ; Polynomial multiply of the high half of %0 (lanes 8-15) by a truncated |
| ; <8 x i16> load, with the product subtracted from %1 and stored to %2. All runs |
| ; share one expected sequence: the truncate becomes a uzp1 feeding pmull2, |
| ; followed by a separate sub. The directives at the top of the file record that |
| ; the GlobalISel run emits a fallback-path warning for this function. |
| define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { |
| ; CHECK-LABEL: pmlsl2_v8i16_uzp1: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q2, [x1, #16] |
| ; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b |
| ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b |
| ; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h |
| ; CHECK-NEXT: str q0, [x0] |
| ; CHECK-NEXT: ret |
| ; The GEP is i32-typed, so index 4 is a 16-byte offset (the "#16" in the ldr). |
| %5 = getelementptr inbounds i32, ptr %3, i64 4 |
| %6 = load <8 x i16>, ptr %5, align 4 |
| %7 = trunc <8 x i16> %6 to <8 x i8> |
| %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %9 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %8, <8 x i8> %7) |
| %10 = sub <8 x i16> %1, %9 |
| store <8 x i16> %10, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Signed widening multiply of the high half of %0 (lanes 8-15) by a truncated |
| ; <8 x i16> load, subtracted from %1 and stored to %2. The NEON and SVE runs |
| ; expect the truncate as a uzp1 and the multiply-subtract as a single smlsl2; |
| ; the GlobalISel run expects the high half moved out, an xtn and a plain smlsl. |
| define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { |
| ; CHECK-NEON-LABEL: smlsl2_v8i16_uzp1: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q2, [x1, #16] |
| ; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: smlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smlsl2_v8i16_uzp1: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q2, [x1, #16] |
| ; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: smlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smlsl2_v8i16_uzp1: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q2, [x1, #16] |
| ; CHECK-GI-NEXT: mov d0, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.8b, v2.8h |
| ; CHECK-GI-NEXT: smlsl v1.8h, v0.8b, v2.8b |
| ; CHECK-GI-NEXT: str q1, [x0] |
| ; CHECK-GI-NEXT: ret |
| ; The GEP is i32-typed, so index 4 is a 16-byte offset (the "#16" in the ldr). |
| %5 = getelementptr inbounds i32, ptr %3, i64 4 |
| %6 = load <8 x i16>, ptr %5, align 4 |
| %7 = trunc <8 x i16> %6 to <8 x i8> |
| %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %9 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %8, <8 x i8> %7) |
| %10 = sub <8 x i16> %1, %9 |
| store <8 x i16> %10, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Unsigned widening multiply of the high half of %0 (lanes 8-15) by a truncated |
| ; <8 x i16> load, subtracted from %1 and stored to %2. The NEON and SVE runs |
| ; expect the truncate as a uzp1 and the multiply-subtract as a single umlsl2; |
| ; the GlobalISel run expects the high half moved out, an xtn and a plain umlsl. |
| define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { |
| ; CHECK-NEON-LABEL: umlsl2_v8i16_uzp1: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q2, [x1, #16] |
| ; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: umlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umlsl2_v8i16_uzp1: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q2, [x1, #16] |
| ; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: umlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umlsl2_v8i16_uzp1: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q2, [x1, #16] |
| ; CHECK-GI-NEXT: mov d0, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.8b, v2.8h |
| ; CHECK-GI-NEXT: umlsl v1.8h, v0.8b, v2.8b |
| ; CHECK-GI-NEXT: str q1, [x0] |
| ; CHECK-GI-NEXT: ret |
| ; The GEP is i32-typed, so index 4 is a 16-byte offset (the "#16" in the ldr). |
| %5 = getelementptr inbounds i32, ptr %3, i64 4 |
| %6 = load <8 x i16>, ptr %5, align 4 |
| %7 = trunc <8 x i16> %6 to <8 x i8> |
| %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %9 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %8, <8 x i8> %7) |
| %10 = sub <8 x i16> %1, %9 |
| store <8 x i16> %10, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Signed widening multiply of the high half of %0 (lanes 4-7 of <8 x i16>) by a |
| ; truncated <4 x i32> load, subtracted from %1 and stored to %2. The NEON and |
| ; SVE runs expect uzp1 plus a single smlsl2; the GlobalISel run expects the high |
| ; half moved out, an xtn and a plain smlsl. |
| define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { |
| ; CHECK-NEON-LABEL: smlsl2_v4i32_uzp1: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q2, [x1, #16] |
| ; CHECK-NEON-NEXT: uzp1 v2.8h, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: smlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smlsl2_v4i32_uzp1: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q2, [x1, #16] |
| ; CHECK-SVE-NEXT: uzp1 v2.8h, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: smlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smlsl2_v4i32_uzp1: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q2, [x1, #16] |
| ; CHECK-GI-NEXT: mov d0, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.4h, v2.4s |
| ; CHECK-GI-NEXT: smlsl v1.4s, v0.4h, v2.4h |
| ; CHECK-GI-NEXT: str q1, [x0] |
| ; CHECK-GI-NEXT: ret |
| ; The GEP is i32-typed, so index 4 is a 16-byte offset (the "#16" in the ldr). |
| %5 = getelementptr inbounds i32, ptr %3, i64 4 |
| %6 = load <4 x i32>, ptr %5, align 4 |
| %7 = trunc <4 x i32> %6 to <4 x i16> |
| %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| %9 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %8, <4 x i16> %7) |
| %10 = sub <4 x i32> %1, %9 |
| store <4 x i32> %10, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Unsigned widening multiply of the high half of %0 (lanes 4-7 of <8 x i16>) by |
| ; a truncated <4 x i32> load, subtracted from %1 and stored to %2. The NEON and |
| ; SVE runs expect uzp1 plus a single umlsl2; the GlobalISel run expects the high |
| ; half moved out, an xtn and a plain umlsl. |
| define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { |
| ; CHECK-NEON-LABEL: umlsl2_v4i32_uzp1: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: ldr q2, [x1, #16] |
| ; CHECK-NEON-NEXT: uzp1 v2.8h, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: umlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umlsl2_v4i32_uzp1: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: ldr q2, [x1, #16] |
| ; CHECK-SVE-NEXT: uzp1 v2.8h, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: umlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umlsl2_v4i32_uzp1: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ldr q2, [x1, #16] |
| ; CHECK-GI-NEXT: mov d0, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.4h, v2.4s |
| ; CHECK-GI-NEXT: umlsl v1.4s, v0.4h, v2.4h |
| ; CHECK-GI-NEXT: str q1, [x0] |
| ; CHECK-GI-NEXT: ret |
| ; The GEP is i32-typed, so index 4 is a 16-byte offset (the "#16" in the ldr). |
| %5 = getelementptr inbounds i32, ptr %3, i64 4 |
| %6 = load <4 x i32>, ptr %5, align 4 |
| %7 = trunc <4 x i32> %6 to <4 x i16> |
| %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| %9 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %8, <4 x i16> %7) |
| %10 = sub <4 x i32> %1, %9 |
| store <4 x i32> %10, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Polynomial multiplies of both halves of %0: the low half (lanes 0-7) times the |
| ; truncated load from %3 and the high half (lanes 8-15) times the truncated load |
| ; 16 bytes further on. The two products are added, subtracted from %1 and stored |
| ; to %2. All runs share one expected sequence: a paired load, one uzp1 covering |
| ; both truncates, pmull + pmull2, then add and sub. The i32 %4 parameter is not |
| ; referenced in the body. The directives at the top of the file record that the |
| ; GlobalISel run emits a fallback-path warning for this function. |
| define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { |
| ; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ldp q2, q3, [x1] |
| ; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b |
| ; CHECK-NEXT: pmull v3.8h, v0.8b, v2.8b |
| ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b |
| ; CHECK-NEXT: add v0.8h, v3.8h, v0.8h |
| ; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h |
| ; CHECK-NEXT: str q0, [x0] |
| ; CHECK-NEXT: ret |
| entry: |
| %5 = load <8 x i16>, ptr %3, align 4 |
| %6 = trunc <8 x i16> %5 to <8 x i8> |
| ; The GEP is i32-typed, so index 4 addresses the 16 bytes right after %5's load. |
| %7 = getelementptr inbounds i32, ptr %3, i64 4 |
| %8 = load <8 x i16>, ptr %7, align 4 |
| %9 = trunc <8 x i16> %8 to <8 x i8> |
| %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %11 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %10, <8 x i8> %6) |
| %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %13 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %12, <8 x i8> %9) |
| %14 = add <8 x i16> %11, %13 |
| %15 = sub <8 x i16> %1, %14 |
| store <8 x i16> %15, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Signed widening multiplies of both halves of %0 by two adjacent truncated |
| ; <8 x i16> loads; the sum of the products is subtracted from %1 and stored to |
| ; %2. The NEON and SVE runs expect a paired load, one uzp1 covering both |
| ; truncates, and an smlsl + smlsl2 pair accumulating directly into v1. The |
| ; GlobalISel run expects two xtn, smull + smlal and a separate sub. The i32 %4 |
| ; parameter is not referenced in the body. |
| define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { |
| ; CHECK-NEON-LABEL: smlsl_smlsl2_v8i16_uzp1: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: ldp q2, q3, [x1] |
| ; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b |
| ; CHECK-NEON-NEXT: smlsl v1.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: smlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smlsl_smlsl2_v8i16_uzp1: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: ldp q2, q3, [x1] |
| ; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b |
| ; CHECK-SVE-NEXT: smlsl v1.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: smlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smlsl_smlsl2_v8i16_uzp1: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: ldp q4, q2, [x1] |
| ; CHECK-GI-NEXT: mov d3, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.8b, v2.8h |
| ; CHECK-GI-NEXT: xtn v4.8b, v4.8h |
| ; CHECK-GI-NEXT: smull v2.8h, v3.8b, v2.8b |
| ; CHECK-GI-NEXT: smlal v2.8h, v0.8b, v4.8b |
| ; CHECK-GI-NEXT: sub v0.8h, v1.8h, v2.8h |
| ; CHECK-GI-NEXT: str q0, [x0] |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %5 = load <8 x i16>, ptr %3, align 4 |
| %6 = trunc <8 x i16> %5 to <8 x i8> |
| ; The GEP is i32-typed, so index 4 addresses the 16 bytes right after %5's load. |
| %7 = getelementptr inbounds i32, ptr %3, i64 4 |
| %8 = load <8 x i16>, ptr %7, align 4 |
| %9 = trunc <8 x i16> %8 to <8 x i8> |
| %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %11 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %10, <8 x i8> %6) |
| %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %13 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %12, <8 x i8> %9) |
| %14 = add <8 x i16> %11, %13 |
| %15 = sub <8 x i16> %1, %14 |
| store <8 x i16> %15, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Unsigned widening multiplies of both halves of %0 by two adjacent truncated |
| ; <8 x i16> loads; the sum of the products is subtracted from %1 and stored to |
| ; %2. The NEON and SVE runs expect a paired load, one uzp1 covering both |
| ; truncates, and a umlsl + umlsl2 pair accumulating directly into v1. The |
| ; GlobalISel run expects two xtn, umull + umlal and a separate sub. The i32 %4 |
| ; parameter is not referenced in the body. |
| define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { |
| ; CHECK-NEON-LABEL: umlsl_umlsl2_v8i16_uzp1: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: ldp q2, q3, [x1] |
| ; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b |
| ; CHECK-NEON-NEXT: umlsl v1.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: umlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umlsl_umlsl2_v8i16_uzp1: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: ldp q2, q3, [x1] |
| ; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b |
| ; CHECK-SVE-NEXT: umlsl v1.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: umlsl2 v1.8h, v0.16b, v2.16b |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umlsl_umlsl2_v8i16_uzp1: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: ldp q4, q2, [x1] |
| ; CHECK-GI-NEXT: mov d3, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.8b, v2.8h |
| ; CHECK-GI-NEXT: xtn v4.8b, v4.8h |
| ; CHECK-GI-NEXT: umull v2.8h, v3.8b, v2.8b |
| ; CHECK-GI-NEXT: umlal v2.8h, v0.8b, v4.8b |
| ; CHECK-GI-NEXT: sub v0.8h, v1.8h, v2.8h |
| ; CHECK-GI-NEXT: str q0, [x0] |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %5 = load <8 x i16>, ptr %3, align 4 |
| %6 = trunc <8 x i16> %5 to <8 x i8> |
| ; The GEP is i32-typed, so index 4 addresses the 16 bytes right after %5's load. |
| %7 = getelementptr inbounds i32, ptr %3, i64 4 |
| %8 = load <8 x i16>, ptr %7, align 4 |
| %9 = trunc <8 x i16> %8 to <8 x i8> |
| %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %11 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %10, <8 x i8> %6) |
| %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %13 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %12, <8 x i8> %9) |
| %14 = add <8 x i16> %11, %13 |
| %15 = sub <8 x i16> %1, %14 |
| store <8 x i16> %15, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Signed widening multiplies of both halves of %0 (<8 x i16>) by two adjacent |
| ; truncated <4 x i32> loads; the sum of the products is subtracted from %1 and |
| ; stored to %2. The NEON and SVE runs expect a paired load, one uzp1 covering |
| ; both truncates, and an smlsl + smlsl2 pair accumulating directly into v1. The |
| ; GlobalISel run expects two xtn, smull + smlal and a separate sub. The i32 %4 |
| ; parameter is not referenced in the body. |
| define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) { |
| ; CHECK-NEON-LABEL: smlsl_smlsl2_v4i32_uzp1: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: ldp q2, q3, [x1] |
| ; CHECK-NEON-NEXT: uzp1 v2.8h, v2.8h, v3.8h |
| ; CHECK-NEON-NEXT: smlsl v1.4s, v0.4h, v2.4h |
| ; CHECK-NEON-NEXT: smlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smlsl_smlsl2_v4i32_uzp1: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: ldp q2, q3, [x1] |
| ; CHECK-SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h |
| ; CHECK-SVE-NEXT: smlsl v1.4s, v0.4h, v2.4h |
| ; CHECK-SVE-NEXT: smlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smlsl_smlsl2_v4i32_uzp1: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: ldp q4, q2, [x1] |
| ; CHECK-GI-NEXT: mov d3, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.4h, v2.4s |
| ; CHECK-GI-NEXT: xtn v4.4h, v4.4s |
| ; CHECK-GI-NEXT: smull v2.4s, v3.4h, v2.4h |
| ; CHECK-GI-NEXT: smlal v2.4s, v0.4h, v4.4h |
| ; CHECK-GI-NEXT: sub v0.4s, v1.4s, v2.4s |
| ; CHECK-GI-NEXT: str q0, [x0] |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %5 = load <4 x i32>, ptr %3, align 4 |
| %6 = trunc <4 x i32> %5 to <4 x i16> |
| ; The GEP is i32-typed, so index 4 addresses the 16 bytes right after %5's load. |
| %7 = getelementptr inbounds i32, ptr %3, i64 4 |
| %8 = load <4 x i32>, ptr %7, align 4 |
| %9 = trunc <4 x i32> %8 to <4 x i16> |
| %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %11 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %10, <4 x i16> %6) |
| %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| %13 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %12, <4 x i16> %9) |
| %14 = add <4 x i32> %11, %13 |
| %15 = sub <4 x i32> %1, %14 |
| store <4 x i32> %15, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Unsigned widening multiplies of both halves of %0 (<8 x i16>) by two adjacent |
| ; truncated <4 x i32> loads; the sum of the products is subtracted from %1 and |
| ; stored to %2. The NEON and SVE runs expect a paired load, one uzp1 covering |
| ; both truncates, and a umlsl + umlsl2 pair accumulating directly into v1. The |
| ; GlobalISel run expects two xtn, umull + umlal and a separate sub. The i32 %4 |
| ; parameter is not referenced in the body. |
| define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) { |
| ; CHECK-NEON-LABEL: umlsl_umlsl2_v4i32_uzp1: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: ldp q2, q3, [x1] |
| ; CHECK-NEON-NEXT: uzp1 v2.8h, v2.8h, v3.8h |
| ; CHECK-NEON-NEXT: umlsl v1.4s, v0.4h, v2.4h |
| ; CHECK-NEON-NEXT: umlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-NEON-NEXT: str q1, [x0] |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umlsl_umlsl2_v4i32_uzp1: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: ldp q2, q3, [x1] |
| ; CHECK-SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h |
| ; CHECK-SVE-NEXT: umlsl v1.4s, v0.4h, v2.4h |
| ; CHECK-SVE-NEXT: umlsl2 v1.4s, v0.8h, v2.8h |
| ; CHECK-SVE-NEXT: str q1, [x0] |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umlsl_umlsl2_v4i32_uzp1: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: ldp q4, q2, [x1] |
| ; CHECK-GI-NEXT: mov d3, v0.d[1] |
| ; CHECK-GI-NEXT: xtn v2.4h, v2.4s |
| ; CHECK-GI-NEXT: xtn v4.4h, v4.4s |
| ; CHECK-GI-NEXT: umull v2.4s, v3.4h, v2.4h |
| ; CHECK-GI-NEXT: umlal v2.4s, v0.4h, v4.4h |
| ; CHECK-GI-NEXT: sub v0.4s, v1.4s, v2.4s |
| ; CHECK-GI-NEXT: str q0, [x0] |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %5 = load <4 x i32>, ptr %3, align 4 |
| %6 = trunc <4 x i32> %5 to <4 x i16> |
| ; The GEP is i32-typed, so index 4 addresses the 16 bytes right after %5's load. |
| %7 = getelementptr inbounds i32, ptr %3, i64 4 |
| %8 = load <4 x i32>, ptr %7, align 4 |
| %9 = trunc <4 x i32> %8 to <4 x i16> |
| %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %11 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %10, <4 x i16> %6) |
| %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| %13 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %12, <4 x i16> %9) |
| %14 = add <4 x i32> %11, %13 |
| %15 = sub <4 x i32> %1, %14 |
| store <4 x i32> %15, ptr %2, align 16 |
| ret void |
| } |
| |
| ; Signed widening multiply where one operand is the high half (lanes 2-3) of %1 |
| ; viewed as <4 x i32> and the other is %0 truncated to <2 x i32>; the product is |
| ; truncated again and added to the low half (lanes 0-1) of the same bitcast. The |
| ; NEON and SVE runs expect the truncate as a uzp1 of v0 with itself feeding |
| ; smull2; the GlobalISel run expects xtn, a d-register mov of the high half and |
| ; a plain smull. |
| define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) { |
| ; CHECK-NEON-LABEL: do_stuff: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: uzp1 v0.4s, v0.4s, v0.4s |
| ; CHECK-NEON-NEXT: smull2 v0.2d, v1.4s, v0.4s |
| ; CHECK-NEON-NEXT: xtn v0.2s, v0.2d |
| ; CHECK-NEON-NEXT: add v0.2s, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: do_stuff: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: uzp1 v0.4s, v0.4s, v0.4s |
| ; CHECK-SVE-NEXT: smull2 v0.2d, v1.4s, v0.4s |
| ; CHECK-SVE-NEXT: xtn v0.2s, v0.2d |
| ; CHECK-SVE-NEXT: add v0.2s, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: do_stuff: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: xtn v0.2s, v0.2d |
| ; CHECK-GI-NEXT: mov d2, v1.d[1] |
| ; CHECK-GI-NEXT: smull v0.2d, v2.2s, v0.2s |
| ; CHECK-GI-NEXT: xtn v0.2s, v0.2d |
| ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: ret |
| %bc.1 = bitcast <2 x i64> %1 to <4 x i32> |
| %trunc.0 = trunc <2 x i64> %0 to <2 x i32> |
| ; Both shuffle masks only index the first operand, so the zeroinitializer |
| ; second operand contributes no lanes. |
| %shuff.hi = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 2, i32 3> |
| %shuff.lo = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1> |
| %smull = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuff.hi, <2 x i32> %trunc.0) |
| %trunc.smull = trunc <2 x i64> %smull to <2 x i32> |
| %final = add <2 x i32> %trunc.smull, %shuff.lo |
| ret <2 x i32> %final |
| } |
| |
| ; Multiplies two <2 x i64> values that were each logically shifted right by 32, |
| ; so the upper 32 bits of every lane are zero. All runs expect each shift to |
| ; become a narrowing shrn #32 and the multiply a single umull. |
| define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) { |
| ; CHECK-LABEL: lsr: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 |
| ; CHECK-NEXT: shrn v1.2s, v1.2d, #32 |
| ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: ret |
| %x = lshr <2 x i64> %a, <i64 32, i64 32> |
| %y = lshr <2 x i64> %b, <i64 32, i64 32> |
| %z = mul nsw <2 x i64> %x, %y |
| ret <2 x i64> %z |
| } |
| |
| ; Multiplies (%a logically shifted right by 32) by a splat of 31. All runs |
| ; expect the constant materialised directly as a .2s movi, the shift as a |
| ; narrowing shrn #32, and a single umull. %b is not referenced in the body. |
| define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) { |
| ; CHECK-LABEL: lsr_const: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: movi v1.2s, #31 |
| ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 |
| ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: ret |
| %x = lshr <2 x i64> %a, <i64 32, i64 32> |
| %z = mul nsw <2 x i64> %x, <i64 31, i64 31> |
| ret <2 x i64> %z |
| } |
| |
| ; Multiplies two <2 x i64> values that were each arithmetically shifted right by |
| ; 32, so every lane is a sign-extended 32-bit value. All runs expect each shift |
| ; to become a narrowing shrn #32 and the multiply a single smull. |
| define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) { |
| ; CHECK-LABEL: asr: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 |
| ; CHECK-NEXT: shrn v1.2s, v1.2d, #32 |
| ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: ret |
| %x = ashr <2 x i64> %a, <i64 32, i64 32> |
| %y = ashr <2 x i64> %b, <i64 32, i64 32> |
| %z = mul nsw <2 x i64> %x, %y |
| ret <2 x i64> %z |
| } |
| |
| ; Multiplies (%a arithmetically shifted right by 32) by a splat of 31. All runs |
| ; expect the constant materialised directly as a .2s movi, the shift as a |
| ; narrowing shrn #32, and a single smull. %b is not referenced in the body. |
| define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) { |
| ; CHECK-LABEL: asr_const: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: movi v1.2s, #31 |
| ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 |
| ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: ret |
| %x = ashr <2 x i64> %a, <i64 32, i64 32> |
| %z = mul nsw <2 x i64> %x, <i64 31, i64 31> |
| ret <2 x i64> %z |
| } |
| |
| ; Computes sext(%A) * sext(%B) + sext(%C) in <8 x i16>. The NEON and SVE runs |
| ; expect smull followed by a widening saddw of %C; the GlobalISel run expects |
| ; %C widened first with sshll, then smlal and a register move of the result. |
| define <8 x i16> @smulladdl_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) nounwind { |
| ; CHECK-NEON-LABEL: smulladdl_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: saddw v0.8h, v0.8h, v2.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smulladdl_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: saddw v0.8h, v0.8h, v2.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smulladdl_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 |
| ; CHECK-GI-NEXT: smlal v2.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %tmp1 = sext <8 x i8> %A to <8 x i16> |
| %tmp2 = sext <8 x i8> %B to <8 x i16> |
| %tmp3 = sext <8 x i8> %C to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp1, %tmp2 |
| %tmp5 = add <8 x i16> %tmp4, %tmp3 |
| ret <8 x i16> %tmp5 |
| } |
| |
| ; Computes zext(%A) * zext(%B) + zext(%C) in <8 x i16>. The NEON and SVE runs |
| ; expect umull followed by a widening uaddw of %C; the GlobalISel run expects |
| ; %C widened first with ushll, then umlal and a register move of the result. |
| define <8 x i16> @umulladdl_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) nounwind { |
| ; CHECK-NEON-LABEL: umulladdl_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: uaddw v0.8h, v0.8h, v2.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umulladdl_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: uaddw v0.8h, v0.8h, v2.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umulladdl_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 |
| ; CHECK-GI-NEXT: umlal v2.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %tmp1 = zext <8 x i8> %A to <8 x i16> |
| %tmp2 = zext <8 x i8> %B to <8 x i16> |
| %tmp3 = zext <8 x i8> %C to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp1, %tmp2 |
| %tmp5 = add <8 x i16> %tmp4, %tmp3 |
| ret <8 x i16> %tmp5 |
| } |
| |
| ; Computes sext(%A) * sext(%B) + %C, where %C is already <8 x i16>. All runs |
| ; expect a single smlal accumulating into %C's register (v2), followed by a move |
| ; of the result into the return register. |
| define <8 x i16> @smlall_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i16> %C) nounwind { |
| ; CHECK-LABEL: smlall_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: smlal v2.8h, v0.8b, v1.8b |
| ; CHECK-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEXT: ret |
| %tmp1 = sext <8 x i8> %A to <8 x i16> |
| %tmp2 = sext <8 x i8> %B to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp1, %tmp2 |
| %tmp5 = add <8 x i16> %tmp4, %C |
| ret <8 x i16> %tmp5 |
| } |
| |
| ; Computes zext(%A) * zext(%B) + %C, where %C is already <8 x i16>. All runs |
| ; expect a single umlal accumulating into %C's register (v2), followed by a move |
| ; of the result into the return register. |
| define <8 x i16> @umlall_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i16> %C) nounwind { |
| ; CHECK-LABEL: umlall_v8i8_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b |
| ; CHECK-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEXT: ret |
| %tmp1 = zext <8 x i8> %A to <8 x i16> |
| %tmp2 = zext <8 x i8> %B to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp1, %tmp2 |
| %tmp5 = add <8 x i16> %tmp4, %C |
| ret <8 x i16> %tmp5 |
| } |
| |
| ; Computes sext(%A) * splat(10) + sext(%C) in <8 x i16>. Every run expects the |
| ; constant materialised as an 8-bit-lane movi so it can feed a widening |
| ; multiply. The NEON and SVE runs then expect smull + saddw; the GlobalISel run |
| ; expects %C widened with sshll, smlal and a register move of the result. |
| define <8 x i16> @smulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind { |
| ; CHECK-NEON-LABEL: smulladdl_const_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: movi v2.8b, #10 |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: saddw v0.8h, v0.8h, v1.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smulladdl_const_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: movi v2.8b, #10 |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: saddw v0.8h, v0.8h, v1.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smulladdl_const_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v2.8b, #10 |
| ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 |
| ; CHECK-GI-NEXT: smlal v1.8h, v0.8b, v2.8b |
| ; CHECK-GI-NEXT: mov v0.16b, v1.16b |
| ; CHECK-GI-NEXT: ret |
| %tmp1 = sext <8 x i8> %A to <8 x i16> |
| %tmp3 = sext <8 x i8> %C to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> |
| %tmp5 = add <8 x i16> %tmp4, %tmp3 |
| ret <8 x i16> %tmp5 |
| } |
| |
| ; Computes zext(%A) * splat(10) + zext(%C) in <8 x i16>. Every run expects the |
| ; constant materialised as an 8-bit-lane movi so it can feed a widening |
| ; multiply. The NEON and SVE runs then expect umull + uaddw; the GlobalISel run |
| ; expects %C widened with ushll, umlal and a register move of the result. |
| define <8 x i16> @umulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind { |
| ; CHECK-NEON-LABEL: umulladdl_const_v8i8_v8i16: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: movi v2.8b, #10 |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: uaddw v0.8h, v0.8h, v1.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umulladdl_const_v8i8_v8i16: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: movi v2.8b, #10 |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: uaddw v0.8h, v0.8h, v1.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umulladdl_const_v8i8_v8i16: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v2.8b, #10 |
| ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 |
| ; CHECK-GI-NEXT: umlal v1.8h, v0.8b, v2.8b |
| ; CHECK-GI-NEXT: mov v0.16b, v1.16b |
| ; CHECK-GI-NEXT: ret |
| %tmp1 = zext <8 x i8> %A to <8 x i16> |
| %tmp3 = zext <8 x i8> %C to <8 x i16> |
| %tmp4 = mul <8 x i16> %tmp1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> |
| %tmp5 = add <8 x i16> %tmp4, %tmp3 |
| ret <8 x i16> %tmp5 |
| } |
| |
| ; Computes (sext(%src1) + sext(%src2)) * sext(%mul) in <8 x i16>. The NEON and |
| ; SVE runs expect the multiply distributed over the add as smull + smlal, both |
| ; using %mul; the GlobalISel run expects the undistributed form: sshll of %mul, |
| ; a widening saddl and a full-width mul. |
| define <8 x i16> @sdistribute_v8i8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8> %mul) { |
| ; CHECK-NEON-LABEL: sdistribute_v8i8: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: sdistribute_v8i8: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: sdistribute_v8i8: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 |
| ; CHECK-GI-NEXT: saddl v0.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %4 = sext <8 x i8> %src1 to <8 x i16> |
| %5 = sext <8 x i8> %mul to <8 x i16> |
| %7 = sext <8 x i8> %src2 to <8 x i16> |
| %8 = add nuw nsw <8 x i16> %4, %7 |
| %9 = mul <8 x i16> %8, %5 |
| ret <8 x i16> %9 |
| } |
| |
| ; Computes (sext(%src1) + splat(10)) * sext(%mul) in <8 x i16>. The NEON and SVE |
| ; runs expect the constant as an 8-bit-lane movi and the multiply distributed as |
| ; smull + smlal; the GlobalISel run expects a 16-bit-lane movi, sshll of %mul, a |
| ; widening saddw and a full-width mul. |
| define <8 x i16> @sdistribute_const1_v8i8(<8 x i8> %src1, <8 x i8> %mul) { |
| ; CHECK-NEON-LABEL: sdistribute_const1_v8i8: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.8b, #10 |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: smlal v0.8h, v2.8b, v1.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: sdistribute_const1_v8i8: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.8b, #10 |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: smlal v0.8h, v2.8b, v1.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: sdistribute_const1_v8i8: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v2.8h, #10 |
| ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 |
| ; CHECK-GI-NEXT: saddw v0.8h, v2.8h, v0.8b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %4 = sext <8 x i8> %src1 to <8 x i16> |
| %5 = sext <8 x i8> %mul to <8 x i16> |
| %8 = add nuw nsw <8 x i16> %4, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> |
| %9 = mul <8 x i16> %8, %5 |
| ret <8 x i16> %9 |
| } |
| |
| ; Computes (sext(%src1) + sext(%src2)) * splat(10) in <8 x i16>. The NEON and |
| ; SVE runs expect the constant as an 8-bit-lane movi and the multiply |
| ; distributed as smull + smlal; the GlobalISel run expects a 16-bit-lane movi, a |
| ; widening saddl and a full-width mul. |
| define <8 x i16> @sdistribute_const2_v8i8(<8 x i8> %src1, <8 x i8> %src2) { |
| ; CHECK-NEON-LABEL: sdistribute_const2_v8i8: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.8b, #10 |
| ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: sdistribute_const2_v8i8: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.8b, #10 |
| ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: sdistribute_const2_v8i8: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v2.8h, #10 |
| ; CHECK-GI-NEXT: saddl v0.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %4 = sext <8 x i8> %src1 to <8 x i16> |
| %5 = sext <8 x i8> %src2 to <8 x i16> |
| %8 = add nuw nsw <8 x i16> %4, %5 |
| %9 = mul <8 x i16> %8, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> |
| ret <8 x i16> %9 |
| } |
| |
| ; Computes (zext(%src1) + zext(%src2)) * zext(%mul) in <8 x i16>. The NEON and |
| ; SVE runs expect the multiply distributed over the add as umull + umlal, both |
| ; using %mul; the GlobalISel run expects the undistributed form: ushll of %mul, |
| ; a widening uaddl and a full-width mul. |
| define <8 x i16> @udistribute_v8i8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8> %mul) { |
| ; CHECK-NEON-LABEL: udistribute_v8i8: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: umlal v0.8h, v1.8b, v2.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: udistribute_v8i8: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: umlal v0.8h, v1.8b, v2.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: udistribute_v8i8: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 |
| ; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %4 = zext <8 x i8> %src1 to <8 x i16> |
| %5 = zext <8 x i8> %mul to <8 x i16> |
| %7 = zext <8 x i8> %src2 to <8 x i16> |
| %8 = add nuw nsw <8 x i16> %4, %7 |
| %9 = mul <8 x i16> %8, %5 |
| ret <8 x i16> %9 |
| } |
| |
| ; Computes (zext(%src1) + splat(10)) * zext(%mul) in <8 x i16>. The NEON and SVE |
| ; runs expect the constant as an 8-bit-lane movi and the multiply distributed as |
| ; umull + umlal; the GlobalISel run expects a 16-bit-lane movi, ushll of %mul, a |
| ; widening uaddw and a full-width mul. |
| define <8 x i16> @udistribute_const1_v8i8(<8 x i8> %src1, <8 x i8> %mul) { |
| ; CHECK-NEON-LABEL: udistribute_const1_v8i8: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.8b, #10 |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-NEON-NEXT: umlal v0.8h, v2.8b, v1.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: udistribute_const1_v8i8: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.8b, #10 |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b |
| ; CHECK-SVE-NEXT: umlal v0.8h, v2.8b, v1.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: udistribute_const1_v8i8: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v2.8h, #10 |
| ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 |
| ; CHECK-GI-NEXT: uaddw v0.8h, v2.8h, v0.8b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %4 = zext <8 x i8> %src1 to <8 x i16> |
| %5 = zext <8 x i8> %mul to <8 x i16> |
| %8 = add nuw nsw <8 x i16> %4, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> |
| %9 = mul <8 x i16> %8, %5 |
| ret <8 x i16> %9 |
| } |
| |
| ; Unsigned distribute with a constant multiplier: (zext(%src1) + zext(%src2)) * splat(10) in <8 x i16>. |
| ; The NEON and SVE runs expect an 8-bit movi of 10 used by umull + umlal; the GlobalISel run |
| ; expects a 16-bit movi, uaddl and a plain vector mul. |
| define <8 x i16> @udistribute_const2_v8i8(<8 x i8> %src1, <8 x i8> %src2) { |
| ; CHECK-NEON-LABEL: udistribute_const2_v8i8: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.8b, #10 |
| ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v2.8b |
| ; CHECK-NEON-NEXT: umlal v0.8h, v1.8b, v2.8b |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: udistribute_const2_v8i8: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.8b, #10 |
| ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v2.8b |
| ; CHECK-SVE-NEXT: umlal v0.8h, v1.8b, v2.8b |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: udistribute_const2_v8i8: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: movi v2.8h, #10 |
| ; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b |
| ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %lhs.wide = zext <8 x i8> %src1 to <8 x i16> |
| %rhs.wide = zext <8 x i8> %src2 to <8 x i16> |
| %sum = add nuw nsw <8 x i16> %lhs.wide, %rhs.wide |
| %prod = mul <8 x i16> %sum, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> |
| ret <8 x i16> %prod |
| } |
| |
| |
| ; Signed widening multiply plus widened addend: sext(%A) * sext(%B) + sext(%C) in <2 x i64>. |
| ; The NEON and SVE runs expect smull followed by saddw; the GlobalISel run expects %C widened |
| ; with sshll, accumulated into by smlal, then moved to the return register. |
| define <2 x i64> @smulladdl_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind { |
| ; CHECK-NEON-LABEL: smulladdl_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: saddw v0.2d, v0.2d, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smulladdl_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: saddw v0.2d, v0.2d, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smulladdl_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: sshll v2.2d, v2.2s, #0 |
| ; CHECK-GI-NEXT: smlal v2.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %a.wide = sext <2 x i32> %A to <2 x i64> |
| %b.wide = sext <2 x i32> %B to <2 x i64> |
| %c.wide = sext <2 x i32> %C to <2 x i64> |
| %prod = mul <2 x i64> %a.wide, %b.wide |
| %sum = add <2 x i64> %prod, %c.wide |
| ret <2 x i64> %sum |
| } |
| |
| ; Unsigned widening multiply plus widened addend: zext(%A) * zext(%B) + zext(%C) in <2 x i64>. |
| ; The NEON and SVE runs expect umull followed by uaddw; the GlobalISel run expects %C widened |
| ; with ushll, accumulated into by umlal, then moved to the return register. |
| define <2 x i64> @umulladdl_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind { |
| ; CHECK-NEON-LABEL: umulladdl_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: uaddw v0.2d, v0.2d, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umulladdl_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: uaddw v0.2d, v0.2d, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umulladdl_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0 |
| ; CHECK-GI-NEXT: umlal v2.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: mov v0.16b, v2.16b |
| ; CHECK-GI-NEXT: ret |
| %a.wide = zext <2 x i32> %A to <2 x i64> |
| %b.wide = zext <2 x i32> %B to <2 x i64> |
| %c.wide = zext <2 x i32> %C to <2 x i64> |
| %prod = mul <2 x i64> %a.wide, %b.wide |
| %sum = add <2 x i64> %prod, %c.wide |
| ret <2 x i64> %sum |
| } |
| |
| ; Signed multiply-accumulate into an already-wide accumulator: sext(%A) * sext(%B) + %C. |
| ; All runs share one expectation: a single smlal into the %C register, then a move to the return register. |
| define <2 x i64> @smlall_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { |
| ; CHECK-LABEL: smlall_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: smlal v2.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEXT: ret |
| %a.wide = sext <2 x i32> %A to <2 x i64> |
| %b.wide = sext <2 x i32> %B to <2 x i64> |
| %prod = mul <2 x i64> %a.wide, %b.wide |
| %acc = add <2 x i64> %prod, %C |
| ret <2 x i64> %acc |
| } |
| |
| ; Unsigned multiply-accumulate into an already-wide accumulator: zext(%A) * zext(%B) + %C. |
| ; All runs share one expectation: a single umlal into the %C register, then a move to the return register. |
| define <2 x i64> @umlall_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { |
| ; CHECK-LABEL: umlall_v2i32_v2i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: umlal v2.2d, v0.2s, v1.2s |
| ; CHECK-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEXT: ret |
| %a.wide = zext <2 x i32> %A to <2 x i64> |
| %b.wide = zext <2 x i32> %B to <2 x i64> |
| %prod = mul <2 x i64> %a.wide, %b.wide |
| %acc = add <2 x i64> %prod, %C |
| ret <2 x i64> %acc |
| } |
| |
| ; Signed widening multiply by a constant plus widened addend: sext(%A) * splat(10) + sext(%C). |
| ; The NEON and SVE runs expect a 32-bit movi of 10, smull, then saddw; the GlobalISel run |
| ; expects the same movi, %C widened with sshll, smlal into it, and a move to the return register. |
| define <2 x i64> @smulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwind { |
| ; CHECK-NEON-LABEL: smulladdl_const_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: movi v2.2s, #10 |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: saddw v0.2d, v0.2d, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: smulladdl_const_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: movi v2.2s, #10 |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: saddw v0.2d, v0.2d, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: smulladdl_const_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v2.2s, #10 |
| ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 |
| ; CHECK-GI-NEXT: smlal v1.2d, v0.2s, v2.2s |
| ; CHECK-GI-NEXT: mov v0.16b, v1.16b |
| ; CHECK-GI-NEXT: ret |
| %a.wide = sext <2 x i32> %A to <2 x i64> |
| %c.wide = sext <2 x i32> %C to <2 x i64> |
| %prod = mul <2 x i64> %a.wide, <i64 10, i64 10> |
| %sum = add <2 x i64> %prod, %c.wide |
| ret <2 x i64> %sum |
| } |
| |
| ; Unsigned widening multiply by a constant plus widened addend: zext(%A) * splat(10) + zext(%C). |
| ; The NEON and SVE runs expect a 32-bit movi of 10, umull, then uaddw; the GlobalISel run |
| ; expects the same movi, %C widened with ushll, umlal into it, and a move to the return register. |
| define <2 x i64> @umulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwind { |
| ; CHECK-NEON-LABEL: umulladdl_const_v2i32_v2i64: |
| ; CHECK-NEON: // %bb.0: |
| ; CHECK-NEON-NEXT: movi v2.2s, #10 |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: uaddw v0.2d, v0.2d, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: umulladdl_const_v2i32_v2i64: |
| ; CHECK-SVE: // %bb.0: |
| ; CHECK-SVE-NEXT: movi v2.2s, #10 |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: uaddw v0.2d, v0.2d, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: umulladdl_const_v2i32_v2i64: |
| ; CHECK-GI: // %bb.0: |
| ; CHECK-GI-NEXT: movi v2.2s, #10 |
| ; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 |
| ; CHECK-GI-NEXT: umlal v1.2d, v0.2s, v2.2s |
| ; CHECK-GI-NEXT: mov v0.16b, v1.16b |
| ; CHECK-GI-NEXT: ret |
| %a.wide = zext <2 x i32> %A to <2 x i64> |
| %c.wide = zext <2 x i32> %C to <2 x i64> |
| %prod = mul <2 x i64> %a.wide, <i64 10, i64 10> |
| %sum = add <2 x i64> %prod, %c.wide |
| ret <2 x i64> %sum |
| } |
| |
| ; Signed distribute: (sext(%src1) + sext(%src2)) * sext(%mul), <2 x i32> inputs widened to <2 x i64>. |
| ; The NEON and SVE runs expect the multiply split into smull + smlal; the GlobalISel run |
| ; expects saddl and sshll, then the 64-bit multiply done per lane in general-purpose registers. |
| define <2 x i64> @sdistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32> %mul) { |
| ; CHECK-NEON-LABEL: sdistribute_v2i32: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: sdistribute_v2i32: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: sdistribute_v2i32: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: sshll v2.2d, v2.2s, #0 |
| ; CHECK-GI-NEXT: saddl v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: fmov x10, d0 |
| ; CHECK-GI-NEXT: fmov x11, d2 |
| ; CHECK-GI-NEXT: mov x8, v0.d[1] |
| ; CHECK-GI-NEXT: mov x9, v2.d[1] |
| ; CHECK-GI-NEXT: mul x10, x10, x11 |
| ; CHECK-GI-NEXT: mul x8, x8, x9 |
| ; CHECK-GI-NEXT: fmov d0, x10 |
| ; CHECK-GI-NEXT: mov v0.d[1], x8 |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %lhs.wide = sext <2 x i32> %src1 to <2 x i64> |
| %mul.wide = sext <2 x i32> %mul to <2 x i64> |
| %rhs.wide = sext <2 x i32> %src2 to <2 x i64> |
| %sum = add nuw nsw <2 x i64> %lhs.wide, %rhs.wide |
| %prod = mul <2 x i64> %sum, %mul.wide |
| ret <2 x i64> %prod |
| } |
| |
| ; Signed distribute with a constant addend: (sext(%src1) + splat(10)) * sext(%mul) in <2 x i64>. |
| ; The NEON and SVE runs expect a 32-bit movi of 10 fed to smull + smlal; the GlobalISel run |
| ; expects the splat loaded from a constant pool, saddw and sshll, then a per-lane scalar multiply. |
| define <2 x i64> @sdistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) { |
| ; CHECK-NEON-LABEL: sdistribute_const1_v2i32: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.2s, #10 |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: smlal v0.2d, v2.2s, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: sdistribute_const1_v2i32: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.2s, #10 |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: smlal v0.2d, v2.2s, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: sdistribute_const1_v2i32: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: adrp x8, .LCPI101_0 |
| ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 |
| ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI101_0] |
| ; CHECK-GI-NEXT: saddw v0.2d, v2.2d, v0.2s |
| ; CHECK-GI-NEXT: fmov x11, d1 |
| ; CHECK-GI-NEXT: mov x9, v1.d[1] |
| ; CHECK-GI-NEXT: fmov x10, d0 |
| ; CHECK-GI-NEXT: mov x8, v0.d[1] |
| ; CHECK-GI-NEXT: mul x10, x10, x11 |
| ; CHECK-GI-NEXT: mul x8, x8, x9 |
| ; CHECK-GI-NEXT: fmov d0, x10 |
| ; CHECK-GI-NEXT: mov v0.d[1], x8 |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %src.wide = sext <2 x i32> %src1 to <2 x i64> |
| %mul.wide = sext <2 x i32> %mul to <2 x i64> |
| %sum = add nuw nsw <2 x i64> %src.wide, <i64 10, i64 10> |
| %prod = mul <2 x i64> %sum, %mul.wide |
| ret <2 x i64> %prod |
| } |
| |
| ; Signed distribute with a constant multiplier: (sext(%src1) + sext(%src2)) * splat(10) in <2 x i64>. |
| ; The NEON and SVE runs expect a 32-bit movi of 10 used by smull + smlal; the GlobalISel run |
| ; expects saddl, the splat loaded from a constant pool, then a per-lane scalar multiply. |
| define <2 x i64> @sdistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) { |
| ; CHECK-NEON-LABEL: sdistribute_const2_v2i32: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.2s, #10 |
| ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: sdistribute_const2_v2i32: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.2s, #10 |
| ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: sdistribute_const2_v2i32: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: adrp x8, .LCPI102_0 |
| ; CHECK-GI-NEXT: saddl v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI102_0] |
| ; CHECK-GI-NEXT: fmov x10, d0 |
| ; CHECK-GI-NEXT: fmov x11, d1 |
| ; CHECK-GI-NEXT: mov x8, v0.d[1] |
| ; CHECK-GI-NEXT: mov x9, v1.d[1] |
| ; CHECK-GI-NEXT: mul x10, x10, x11 |
| ; CHECK-GI-NEXT: mul x8, x8, x9 |
| ; CHECK-GI-NEXT: fmov d0, x10 |
| ; CHECK-GI-NEXT: mov v0.d[1], x8 |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %lhs.wide = sext <2 x i32> %src1 to <2 x i64> |
| %rhs.wide = sext <2 x i32> %src2 to <2 x i64> |
| %sum = add nuw nsw <2 x i64> %lhs.wide, %rhs.wide |
| %prod = mul <2 x i64> %sum, <i64 10, i64 10> |
| ret <2 x i64> %prod |
| } |
| |
| ; Unsigned distribute: (zext(%src1) + zext(%src2)) * zext(%mul), <2 x i32> inputs widened to <2 x i64>. |
| ; The NEON and SVE runs expect the multiply split into umull + umlal; the GlobalISel run |
| ; expects uaddl and ushll, then the 64-bit multiply done per lane in general-purpose registers. |
| define <2 x i64> @udistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32> %mul) { |
| ; CHECK-NEON-LABEL: udistribute_v2i32: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: umlal v0.2d, v1.2s, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: udistribute_v2i32: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: umlal v0.2d, v1.2s, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: udistribute_v2i32: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0 |
| ; CHECK-GI-NEXT: uaddl v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: fmov x10, d0 |
| ; CHECK-GI-NEXT: fmov x11, d2 |
| ; CHECK-GI-NEXT: mov x8, v0.d[1] |
| ; CHECK-GI-NEXT: mov x9, v2.d[1] |
| ; CHECK-GI-NEXT: mul x10, x10, x11 |
| ; CHECK-GI-NEXT: mul x8, x8, x9 |
| ; CHECK-GI-NEXT: fmov d0, x10 |
| ; CHECK-GI-NEXT: mov v0.d[1], x8 |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %lhs.wide = zext <2 x i32> %src1 to <2 x i64> |
| %mul.wide = zext <2 x i32> %mul to <2 x i64> |
| %rhs.wide = zext <2 x i32> %src2 to <2 x i64> |
| %sum = add nuw nsw <2 x i64> %lhs.wide, %rhs.wide |
| %prod = mul <2 x i64> %sum, %mul.wide |
| ret <2 x i64> %prod |
| } |
| |
| ; Unsigned distribute with a constant addend: (zext(%src1) + splat(10)) * zext(%mul) in <2 x i64>. |
| ; The NEON and SVE runs expect a 32-bit movi of 10 fed to umull + umlal; the GlobalISel run |
| ; expects the splat loaded from a constant pool, uaddw and ushll, then a per-lane scalar multiply. |
| define <2 x i64> @udistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) { |
| ; CHECK-NEON-LABEL: udistribute_const1_v2i32: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.2s, #10 |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-NEON-NEXT: umlal v0.2d, v2.2s, v1.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: udistribute_const1_v2i32: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.2s, #10 |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s |
| ; CHECK-SVE-NEXT: umlal v0.2d, v2.2s, v1.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: udistribute_const1_v2i32: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: adrp x8, .LCPI104_0 |
| ; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 |
| ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI104_0] |
| ; CHECK-GI-NEXT: uaddw v0.2d, v2.2d, v0.2s |
| ; CHECK-GI-NEXT: fmov x11, d1 |
| ; CHECK-GI-NEXT: mov x9, v1.d[1] |
| ; CHECK-GI-NEXT: fmov x10, d0 |
| ; CHECK-GI-NEXT: mov x8, v0.d[1] |
| ; CHECK-GI-NEXT: mul x10, x10, x11 |
| ; CHECK-GI-NEXT: mul x8, x8, x9 |
| ; CHECK-GI-NEXT: fmov d0, x10 |
| ; CHECK-GI-NEXT: mov v0.d[1], x8 |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %src.wide = zext <2 x i32> %src1 to <2 x i64> |
| %mul.wide = zext <2 x i32> %mul to <2 x i64> |
| %sum = add nuw nsw <2 x i64> %src.wide, <i64 10, i64 10> |
| %prod = mul <2 x i64> %sum, %mul.wide |
| ret <2 x i64> %prod |
| } |
| |
| ; Unsigned distribute with a constant multiplier: (zext(%src1) + zext(%src2)) * splat(10) in <2 x i64>. |
| ; The NEON and SVE runs expect a 32-bit movi of 10 used by umull + umlal; the GlobalISel run |
| ; expects uaddl, the splat loaded from a constant pool, then a per-lane scalar multiply. |
| define <2 x i64> @udistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) { |
| ; CHECK-NEON-LABEL: udistribute_const2_v2i32: |
| ; CHECK-NEON: // %bb.0: // %entry |
| ; CHECK-NEON-NEXT: movi v2.2s, #10 |
| ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-NEON-NEXT: umlal v0.2d, v1.2s, v2.2s |
| ; CHECK-NEON-NEXT: ret |
| ; |
| ; CHECK-SVE-LABEL: udistribute_const2_v2i32: |
| ; CHECK-SVE: // %bb.0: // %entry |
| ; CHECK-SVE-NEXT: movi v2.2s, #10 |
| ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s |
| ; CHECK-SVE-NEXT: umlal v0.2d, v1.2s, v2.2s |
| ; CHECK-SVE-NEXT: ret |
| ; |
| ; CHECK-GI-LABEL: udistribute_const2_v2i32: |
| ; CHECK-GI: // %bb.0: // %entry |
| ; CHECK-GI-NEXT: adrp x8, .LCPI105_0 |
| ; CHECK-GI-NEXT: uaddl v0.2d, v0.2s, v1.2s |
| ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI105_0] |
| ; CHECK-GI-NEXT: fmov x10, d0 |
| ; CHECK-GI-NEXT: fmov x11, d1 |
| ; CHECK-GI-NEXT: mov x8, v0.d[1] |
| ; CHECK-GI-NEXT: mov x9, v1.d[1] |
| ; CHECK-GI-NEXT: mul x10, x10, x11 |
| ; CHECK-GI-NEXT: mul x8, x8, x9 |
| ; CHECK-GI-NEXT: fmov d0, x10 |
| ; CHECK-GI-NEXT: mov v0.d[1], x8 |
| ; CHECK-GI-NEXT: ret |
| entry: |
| %lhs.wide = zext <2 x i32> %src1 to <2 x i64> |
| %rhs.wide = zext <2 x i32> %src2 to <2 x i64> |
| %sum = add nuw nsw <2 x i64> %lhs.wide, %rhs.wide |
| %prod = mul <2 x i64> %sum, <i64 10, i64 10> |
| ret <2 x i64> %prod |
| } |
| |
| declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) |
| declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) |
| declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) |
| declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) |
| declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) |
| declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) |