| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s | FileCheck %s --check-prefixes=CHECK
| |
; User code intends to execute {pmull, pmull2} instructions on the {lower, higher} halves of the same vector registers directly.
; Test that PMULL2 is generated for the higher-half operands.
; Without this optimization, suboptimal codegen fails to use the higher-half contents in place; instead, it moves the
; higher-lane contents to the lower lane in order to use PMULL everywhere, generating unnecessary moves.
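;
; For reference, a hedged C/ACLE sketch of the kind of user code this pattern arises from (intrinsic
; names are from arm_neon.h; K_LO/K_HI are placeholder constants standing in for the immediates in
; this test, not names taken from it):
;
;   poly64x2_t v  = vld1q_p64(p);                            // ldp/ldr
;   poly128_t  lo = vmull_p64(vgetq_lane_p64(v, 0), K_LO);   // PMULL on the lower half
;   poly128_t  hi = vmull_high_p64(v, vdupq_n_p64(K_HI));    // PMULL2 on the higher half, in place
;   // XOR-combining lo and hi corresponds to the eor/stp sequence checked below.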
| define void @test1(ptr %0, ptr %1) { |
| ; CHECK-LABEL: test1: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, #56824 // =0xddf8 |
| ; CHECK-NEXT: mov w9, #61186 // =0xef02 |
| ; CHECK-NEXT: movk w8, #40522, lsl #16 |
| ; CHECK-NEXT: movk w9, #29710, lsl #16 |
| ; CHECK-NEXT: ldp q0, q1, [x1] |
| ; CHECK-NEXT: dup v2.2d, x8 |
| ; CHECK-NEXT: fmov d3, x9 |
| ; CHECK-NEXT: pmull v4.1q, v0.1d, v3.1d |
| ; CHECK-NEXT: pmull v3.1q, v1.1d, v3.1d |
| ; CHECK-NEXT: pmull2 v0.1q, v0.2d, v2.2d |
| ; CHECK-NEXT: pmull2 v1.1q, v1.2d, v2.2d |
| ; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b |
| ; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b |
| ; CHECK-NEXT: stp q0, q1, [x1] |
| ; CHECK-NEXT: ret |
| %3 = load <2 x i64>, ptr %1 |
| %4 = getelementptr inbounds <2 x i64>, ptr %1, i64 1 |
| %5 = load <2 x i64>, ptr %4 |
| %6 = extractelement <2 x i64> %3, i64 1 |
| %7 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 2655706616) |
| %8 = extractelement <2 x i64> %5, i64 1 |
| %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 2655706616) |
| %10 = load <2 x i64>, ptr %0 |
| %11 = getelementptr inbounds i8, ptr %0, i64 16 |
| %12 = load <2 x i64>, ptr %11 |
| %13 = extractelement <2 x i64> %3, i64 0 |
| %14 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %13, i64 1947135746) |
| %15 = extractelement <2 x i64> %5, i64 0 |
| %16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746) |
| %17 = xor <16 x i8> %14, %7 |
| %18 = xor <16 x i8> %16, %9 |
| store <16 x i8> %17, ptr %1 |
| store <16 x i8> %18, ptr %4 |
| ret void |
| } |
| |
; One operand is the higher half of a SIMD register, and the other operand is the lower half of another SIMD register.
; Test that codegen doesn't generate unnecessary moves.
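;
; A hedged C/ACLE sketch of the mixed-half pattern (intrinsic names from arm_neon.h; a and b are
; illustrative parameter names, not taken from this test):
;
;   poly128_t r = vmull_p64(vgetq_lane_p64(a, 1), vgetq_lane_p64(b, 0));
;
; Codegen is expected to broadcast b's lane 0 across a vector (dup) and feed PMULL2 with a's
; higher half directly, rather than moving a's lane 1 down to lane 0.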
| define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) { |
| ; CHECK-LABEL: test2: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: dup v1.2d, v1.d[0] |
| ; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d |
| ; CHECK-NEXT: str q0, [x0] |
| ; CHECK-NEXT: ret |
| %4 = extractelement <2 x i64> %1, i64 1 |
| %5 = extractelement <2 x i64> %2, i64 0 |
| %6 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %5) |
| store <16 x i8> %6, ptr %0, align 16 |
| ret void |
| } |
| |
; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 input parameter.
; Test that %2 is duplicated into the proper lane of a SIMD register directly for optimal codegen.
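;
; A hedged C/ACLE sketch of the vector/scalar pattern (intrinsic names from arm_neon.h; a and k
; are illustrative names, not taken from this test):
;
;   poly128_t r = vmull_p64(vgetq_lane_p64(a, 1), k);
;
; The scalar k is expected to be broadcast straight from the GPR (dup v1.2d, x1) so that PMULL2
; can consume a's higher half in place.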
| define void @test3(ptr %0, <2 x i64> %1, i64 %2) { |
| ; CHECK-LABEL: test3: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: dup v1.2d, x1 |
| ; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d |
| ; CHECK-NEXT: str q0, [x0] |
| ; CHECK-NEXT: ret |
| %4 = extractelement <2 x i64> %1, i64 1 |
| %5 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %2) |
| store <16 x i8> %5, ptr %0, align 16 |
| ret void |
| } |
| |
| declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) |