| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
 | ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE | 
 | ; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE | 
 |  | 
 | declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) | 
 | declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) | 
 | declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) | 
 | declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) | 
 | declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) | 
 | declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) | 
 | declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) | 
 | declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2) | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_base(<8 x i16> %a, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_base: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_base: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %a = bitcast <2 x i64> %aa to <8 x i16> | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %b = bitcast <16 x i8> %bb to <8 x i16> | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %s1a = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> | 
 |   %s1 = bitcast <1 x i64> %s1a to <4 x i16> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %s2 = bitcast <8 x i8> %s2a to <4 x i16> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1_wrongindex: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-LE-NEXT:    ext v0.8b, v0.8b, v2.8b, #4 | 
 | ; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1_wrongindex: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %a = bitcast <2 x i64> %aa to <8 x i16> | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i8> %bb) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1_wrongindex: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-LE-NEXT:    ext v1.8b, v1.8b, v2.8b, #6 | 
 | ; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1_wrongindex: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #6 | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %b = bitcast <16 x i8> %bb to <8 x i16> | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2_wrongindex: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4 | 
 | ; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2_wrongindex: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 2> | 
 |   %s1 = bitcast <2 x i32> %s1a to <4 x i16> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i8> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2_wrongindex: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #4 | 
 | ; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2_wrongindex: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #4 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> | 
 |   %s2 = bitcast <8 x i8> %s2a to <4 x i16> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_splata1: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    smull2 v0.4s, v1.8h, v0.h[3] | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_splata1: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    smull2 v0.4s, v1.8h, v0.h[3] | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %a = bitcast <2 x i64> %aa to <8 x i16> | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_splatb1: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.h[3] | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_splatb1: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.h[3] | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %b = bitcast <16 x i8> %bb to <8 x i16> | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_splata2: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    dup v0.2s, v0.s[3] | 
 | ; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_splata2: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    dup v0.2s, v0.s[3] | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 3, i32 3> | 
 |   %s1 = bitcast <2 x i32> %s1a to <4 x i16> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_smull_high_s16_splatb2: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-LE-NEXT:    dup v1.8b, v1.b[3] | 
 | ; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_smull_high_s16_splatb2: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    dup v1.8b, v1.b[3] | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b | 
 | ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> | 
 |   %s2 = bitcast <8 x i8> %s2a to <4 x i16> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 |  | 
 |  | 
 | define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    umull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    umull2 v0.4s, v0.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %a = bitcast <2 x i64> %aa to <8 x i16> | 
 |   %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %r = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %s1, <4 x i16> %s2) | 
 |   ret <4 x i32> %r | 
 | } | 
 |  | 
 | define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) { | 
 | ; CHECK-LE-LABEL: test_vabdl_high_u82: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    uabdl2 v0.8h, v0.16b, v1.16b | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_vabdl_high_u82: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b | 
 | ; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    uabdl2 v0.8h, v0.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %b = bitcast <8 x i16> %bb to <16 x i8> | 
 |   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) | 
 |   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> | 
 |   ret <8 x i16> %vmovl.i.i.i | 
 | } | 
 |  | 
 | define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) { | 
 | ; CHECK-LE-LABEL: test_vabdl_high_s82: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    sabdl2 v0.8h, v0.16b, v1.16b | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_vabdl_high_s82: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b | 
 | ; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    sabdl2 v0.8h, v0.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %b = bitcast <8 x i16> %bb to <16 x i8> | 
 |   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) | 
 |   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> | 
 |   ret <8 x i16> %vmovl.i.i.i | 
 | } | 
 |  | 
 | define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) { | 
 | ; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.8h | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8 | 
 | ; CHECK-BE-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.8h | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %c = bitcast <16 x i8> %cc to <8 x i16> | 
 |   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
 |   %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) | 
 |   %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i) | 
 |   ret <4 x i32> %vqdmlal4.i.i | 
 | } | 
 |  | 
 | define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) { | 
 | ; CHECK-LE-LABEL: test_pmull_high_p8_128: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    fmov d0, x3 | 
 | ; CHECK-LE-NEXT:    fmov d1, x1 | 
 | ; CHECK-LE-NEXT:    pmull v0.8h, v1.8b, v0.8b | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_pmull_high_p8_128: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    fmov d0, x3 | 
 | ; CHECK-BE-NEXT:    fmov d1, x1 | 
 | ; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b | 
 | ; CHECK-BE-NEXT:    rev64 v1.8b, v1.8b | 
 | ; CHECK-BE-NEXT:    pmull v0.8h, v1.8b, v0.8b | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %a = bitcast i128 %aa to <16 x i8> | 
 |   %b = bitcast i128 %bb to <16 x i8> | 
 |   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) | 
 |   ret <8 x i16> %vmull.i.i | 
 | } | 
 |  | 
 | define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) { | 
 | ; CHECK-LE-LABEL: test_pmull_high_p8_64: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    pmull2 v0.8h, v0.16b, v1.16b | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_pmull_high_p8_64: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b | 
 | ; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    pmull2 v0.8h, v0.16b, v1.16b | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %a = bitcast <2 x i64> %aa to <16 x i8> | 
 |   %b = bitcast <2 x i64> %bb to <16 x i8> | 
 |   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | 
 |   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) | 
 |   ret <8 x i16> %vmull.i.i | 
 | } | 
 |  | 
 | define <8 x i16> @foov8i16(<16 x i8> %a1, <2 x i64> %b1) { | 
 | ; CHECK-LE-LABEL: foov8i16: | 
 | ; CHECK-LE:       // %bb.0: | 
 | ; CHECK-LE-NEXT:    shrn v0.4h, v0.4s, #5 | 
 | ; CHECK-LE-NEXT:    shrn2 v0.8h, v1.4s, #5 | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: foov8i16: | 
 | ; CHECK-BE:       // %bb.0: | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    shrn v0.4h, v0.4s, #5 | 
 | ; CHECK-BE-NEXT:    shrn2 v0.8h, v1.4s, #5 | 
 | ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 |   %a0 = bitcast <16 x i8> %a1 to <4 x i32> | 
 |   %b0 = bitcast <2 x i64> %b1 to <4 x i32> | 
 |   %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5> | 
 |   %vshrn_low = trunc <4 x i32> %vshrn_low_shift to <4 x i16> | 
 |   %vshrn_high_shift = lshr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5> | 
 |   %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16> | 
 |   %1 = bitcast <4 x i16> %vshrn_low to <1 x i64> | 
 |   %2 = bitcast <4 x i16> %vshrn_high to <1 x i64> | 
 |   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> | 
 |   %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> | 
 |   ret <8 x i16> %3 | 
 | } | 
 |  | 
 | define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) { | 
 | ; CHECK-LE-LABEL: hadd32_zext_asr: | 
 | ; CHECK-LE:       // %bb.0: | 
 | ; CHECK-LE-NEXT:    ushll2 v0.2d, v0.4s, #1 | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: hadd32_zext_asr: | 
 | ; CHECK-BE:       // %bb.0: | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ushll2 v0.2d, v0.4s, #1 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 |   %src1 = bitcast <16 x i8> %src1a to <4 x i32> | 
 |   %s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> | 
 |   %zextsrc1 = zext <2 x i32> %s1 to <2 x i64> | 
 |   %resulti32 = shl <2 x i64> %zextsrc1, <i64 1, i64 1> | 
 |   ret <2 x i64> %resulti32 | 
 | } | 
 |  | 
 | define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 { | 
 | ; CHECK-LE-LABEL: test_umull_high_s16_splata1: | 
 | ; CHECK-LE:       // %bb.0: // %entry | 
 | ; CHECK-LE-NEXT:    umull2 v0.2d, v1.4s, v0.s[1] | 
 | ; CHECK-LE-NEXT:    ret | 
 | ; | 
 | ; CHECK-BE-LABEL: test_umull_high_s16_splata1: | 
 | ; CHECK-BE:       // %bb.0: // %entry | 
 | ; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s | 
 | ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s | 
 | ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8 | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    umull2 v0.2d, v1.4s, v0.s[1] | 
 | ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8 | 
 | ; CHECK-BE-NEXT:    ret | 
 | entry: | 
 |   %a = bitcast <2 x i64> %aa to <4 x i32> | 
 |   %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 1> | 
 |   %s2 = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> | 
 |   %r = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2) | 
 |   ret <2 x i64> %r | 
 | } |