| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon < %s | FileCheck %s |
| ; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+bf16 < %s | FileCheck %s |
| ; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+fullfp16,+bf16 < %s | FileCheck %s |
| |
| ; Pair-of-vectors aggregates returned by the zip/uzp/trn tests below: |
| ; two <4 x bfloat> vectors and two <8 x bfloat> vectors respectively. |
| ; NOTE: despite the "float16" in the type names, the elements are bfloat. |
| %struct.float16x4x2_t = type { [2 x <4 x bfloat>] } |
| %struct.float16x8x2_t = type { [2 x <8 x bfloat>] } |
| |
| ; Zip of two <4 x bfloat> vectors. Mask <0,4,1,5> interleaves the low halves |
| ; of %a and %b, mask <2,6,3,7> interleaves the high halves; both results are |
| ; returned together as a two-vector aggregate. The assertions expect one |
| ; zip1 and one zip2 on .4h lanes plus a register copy into d0. |
| define dso_local %struct.float16x4x2_t @test_vzip_bf16(<4 x bfloat> %a, <4 x bfloat> %b) { |
| ; CHECK-LABEL: test_vzip_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: zip1 v2.4h, v0.4h, v1.4h |
| ; CHECK-NEXT: zip2 v1.4h, v0.4h, v1.4h |
| ; CHECK-NEXT: fmov d0, d2 |
| ; CHECK-NEXT: ret |
| entry: |
| %zip.lo = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> |
| %zip.hi = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> |
| %pair.first = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %zip.lo, 0, 0 |
| %pair = insertvalue %struct.float16x4x2_t %pair.first, <4 x bfloat> %zip.hi, 0, 1 |
| ret %struct.float16x4x2_t %pair |
| } |
| |
| ; 128-bit variant of the zip test: interleaves elements 0-3 of %a and %b |
| ; (first result) and elements 4-7 (second result) of two <8 x bfloat> |
| ; vectors. The assertions expect zip1/zip2 on .8h lanes and a full-register |
| ; move of the first result into v0. |
| define dso_local %struct.float16x8x2_t @test_vzipq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) { |
| ; CHECK-LABEL: test_vzipq_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEXT: ret |
| entry: |
| %zip.lo = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> |
| %zip.hi = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> |
| %pair.first = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %zip.lo, 0, 0 |
| %pair = insertvalue %struct.float16x8x2_t %pair.first, <8 x bfloat> %zip.hi, 0, 1 |
| ret %struct.float16x8x2_t %pair |
| } |
| |
| ; Unzip of two <4 x bfloat> vectors. Mask <0,2,4,6> gathers the even-indexed |
| ; elements of the concatenation %a:%b, mask <1,3,5,7> the odd-indexed ones. |
| ; The assertions expect uzp1/uzp2 on .4h lanes plus a copy into d0. |
| define dso_local %struct.float16x4x2_t @test_vuzp_bf16(<4 x bfloat> %a, <4 x bfloat> %b) { |
| ; CHECK-LABEL: test_vuzp_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: uzp1 v2.4h, v0.4h, v1.4h |
| ; CHECK-NEXT: uzp2 v1.4h, v0.4h, v1.4h |
| ; CHECK-NEXT: fmov d0, d2 |
| ; CHECK-NEXT: ret |
| entry: |
| %uzp.even = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> |
| %uzp.odd = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> |
| %pair.first = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %uzp.even, 0, 0 |
| %pair = insertvalue %struct.float16x4x2_t %pair.first, <4 x bfloat> %uzp.odd, 0, 1 |
| ret %struct.float16x4x2_t %pair |
| } |
| |
| ; 128-bit variant of the unzip test: the first result takes the even-indexed |
| ; elements (0,2,...,14) of %a:%b, the second the odd-indexed ones (1,3,...,15). |
| ; The assertions expect uzp1/uzp2 on .8h lanes and a full-register move of |
| ; the first result into v0. |
| define dso_local %struct.float16x8x2_t @test_vuzpq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) { |
| ; CHECK-LABEL: test_vuzpq_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEXT: ret |
| entry: |
| %uzp.even = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> |
| %uzp.odd = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> |
| %pair.first = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %uzp.even, 0, 0 |
| %pair = insertvalue %struct.float16x8x2_t %pair.first, <8 x bfloat> %uzp.odd, 0, 1 |
| ret %struct.float16x8x2_t %pair |
| } |
| |
| ; Transpose of two <4 x bfloat> vectors. Mask <0,4,2,6> pairs the even lanes |
| ; of %a with the even lanes of %b, mask <1,5,3,7> does the same for the odd |
| ; lanes. The assertions expect trn1/trn2 on .4h lanes plus a copy into d0. |
| define dso_local %struct.float16x4x2_t @test_vtrn_bf16(<4 x bfloat> %a, <4 x bfloat> %b) { |
| ; CHECK-LABEL: test_vtrn_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: trn1 v2.4h, v0.4h, v1.4h |
| ; CHECK-NEXT: trn2 v1.4h, v0.4h, v1.4h |
| ; CHECK-NEXT: fmov d0, d2 |
| ; CHECK-NEXT: ret |
| entry: |
| %trn.even = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> |
| %trn.odd = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> |
| %pair.first = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %trn.even, 0, 0 |
| %pair = insertvalue %struct.float16x4x2_t %pair.first, <4 x bfloat> %trn.odd, 0, 1 |
| ret %struct.float16x4x2_t %pair |
| } |
| |
| ; 128-bit variant of the transpose test on <8 x bfloat>: the first result |
| ; pairs even lanes of %a and %b (0,8,2,10,...), the second pairs odd lanes |
| ; (1,9,3,11,...). The assertions expect trn1/trn2 on .8h lanes and a |
| ; full-register move of the first result into v0. |
| define dso_local %struct.float16x8x2_t @test_vtrnq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) { |
| ; CHECK-LABEL: test_vtrnq_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: trn1 v2.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: trn2 v1.8h, v0.8h, v1.8h |
| ; CHECK-NEXT: mov v0.16b, v2.16b |
| ; CHECK-NEXT: ret |
| entry: |
| %trn.even = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> |
| %trn.odd = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> |
| %pair.first = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %trn.even, 0, 0 |
| %pair = insertvalue %struct.float16x8x2_t %pair.first, <8 x bfloat> %trn.odd, 0, 1 |
| ret %struct.float16x8x2_t %pair |
| } |
| |
| ; Splat of a scalar bfloat into <4 x bfloat>. The scalar arrives as a float |
| ; argument; its bit pattern is truncated to the low 16 bits and reinterpreted |
| ; as bfloat before being broadcast. The assertions expect a single dup from |
| ; h[0] with no integer-register round trip. |
| define dso_local <4 x bfloat> @test_vmov_n_bf16(float %a.coerce) { |
| ; CHECK-LABEL: test_vmov_n_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 |
| ; CHECK-NEXT: dup v0.4h, v0.h[0] |
| ; CHECK-NEXT: ret |
| entry: |
| %bits32 = bitcast float %a.coerce to i32 |
| %bits16 = trunc i32 %bits32 to i16 |
| %scalar = bitcast i16 %bits16 to bfloat |
| %lane0 = insertelement <4 x bfloat> undef, bfloat %scalar, i32 0 |
| %splat = shufflevector <4 x bfloat> %lane0, <4 x bfloat> undef, <4 x i32> zeroinitializer |
| ret <4 x bfloat> %splat |
| } |
| |
| ; Splat of a scalar bfloat into <8 x bfloat>. The scalar is passed as a |
| ; float, narrowed to its low 16 bits, reinterpreted as bfloat and broadcast |
| ; to all eight lanes. The assertions expect a single dup to .8h from h[0]. |
| define dso_local <8 x bfloat> @test_vmovq_n_bf16(float %a.coerce) { |
| ; CHECK-LABEL: test_vmovq_n_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 |
| ; CHECK-NEXT: dup v0.8h, v0.h[0] |
| ; CHECK-NEXT: ret |
| entry: |
| %bits32 = bitcast float %a.coerce to i32 |
| %bits16 = trunc i32 %bits32 to i16 |
| %scalar = bitcast i16 %bits16 to bfloat |
| %lane0 = insertelement <8 x bfloat> undef, bfloat %scalar, i32 0 |
| %splat = shufflevector <8 x bfloat> %lane0, <8 x bfloat> undef, <8 x i32> zeroinitializer |
| ret <8 x bfloat> %splat |
| } |
| |
| ; Scalar-to-<4 x bfloat> broadcast; same operation sequence as |
| ; test_vmov_n_bf16 (float bits -> low 16 bits -> bfloat -> splat). The |
| ; assertions expect a single dup to .4h from h[0]. |
| define dso_local <4 x bfloat> @test_vdup_n_bf16(float %a.coerce) { |
| ; CHECK-LABEL: test_vdup_n_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 |
| ; CHECK-NEXT: dup v0.4h, v0.h[0] |
| ; CHECK-NEXT: ret |
| entry: |
| %bits32 = bitcast float %a.coerce to i32 |
| %bits16 = trunc i32 %bits32 to i16 |
| %scalar = bitcast i16 %bits16 to bfloat |
| %lane0 = insertelement <4 x bfloat> undef, bfloat %scalar, i32 0 |
| %splat = shufflevector <4 x bfloat> %lane0, <4 x bfloat> undef, <4 x i32> zeroinitializer |
| ret <4 x bfloat> %splat |
| } |
| |
| ; Scalar-to-<8 x bfloat> broadcast; same operation sequence as |
| ; test_vmovq_n_bf16 (float bits -> low 16 bits -> bfloat -> splat). The |
| ; assertions expect a single dup to .8h from h[0]. |
| define dso_local <8 x bfloat> @test_vdupq_n_bf16(float %a.coerce) { |
| ; CHECK-LABEL: test_vdupq_n_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 |
| ; CHECK-NEXT: dup v0.8h, v0.h[0] |
| ; CHECK-NEXT: ret |
| entry: |
| %bits32 = bitcast float %a.coerce to i32 |
| %bits16 = trunc i32 %bits32 to i16 |
| %scalar = bitcast i16 %bits16 to bfloat |
| %lane0 = insertelement <8 x bfloat> undef, bfloat %scalar, i32 0 |
| %splat = shufflevector <8 x bfloat> %lane0, <8 x bfloat> undef, <8 x i32> zeroinitializer |
| ret <8 x bfloat> %splat |
| } |
| |
| ; Broadcasts lane 3 of a <4 x bfloat> vector to all four result lanes |
| ; (mask <3,3,3,3>). The assertions expect a single lane-indexed dup from h[3]. |
| define dso_local <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %a) { |
| ; CHECK-LABEL: test_vdup_lane_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NEXT: dup v0.4h, v0.h[3] |
| ; CHECK-NEXT: ret |
| entry: |
| %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> |
| ret <4 x bfloat> %shuffle |
| } |
| |
| ; Widening lane broadcast: lane 3 of a <4 x bfloat> input is replicated into |
| ; all eight lanes of a <8 x bfloat> result. The assertions expect a single |
| ; dup to .8h from h[3]. |
| define dso_local <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %a) { |
| ; CHECK-LABEL: test_vdupq_lane_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NEXT: dup v0.8h, v0.h[3] |
| ; CHECK-NEXT: ret |
| entry: |
| %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> |
| ret <8 x bfloat> %shuffle |
| } |
| |
| ; Extracts a 4-element window starting at element 2 of the concatenation |
| ; %a:%b (mask <2,3,4,5>). The assertions expect one ext with byte offset #4, |
| ; i.e. 2 elements of 2 bytes each. |
| define dso_local <4 x bfloat> @test_vext_bf16(<4 x bfloat> %a, <4 x bfloat> %b) { |
| ; CHECK-LABEL: test_vext_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 |
| ; CHECK-NEXT: ret |
| entry: |
| %vext = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> |
| ret <4 x bfloat> %vext |
| } |
| |
| ; Extracts an 8-element window starting at element 5 of the concatenation |
| ; %a:%b (mask <5,...,12>). The assertions expect one ext with byte offset |
| ; #10, i.e. 5 elements of 2 bytes each. |
| define dso_local <8 x bfloat> @test_vextq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) { |
| ; CHECK-LABEL: test_vextq_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #10 |
| ; CHECK-NEXT: ret |
| entry: |
| %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12> |
| ret <8 x bfloat> %vext |
| } |
| |
| ; Narrowing extract of the upper half (elements 4-7) of a <8 x bfloat> |
| ; vector. The assertions expect an ext of the register with itself at byte |
| ; offset #8, after which the low 64 bits (d0) hold the result. |
| define dso_local <4 x bfloat> @test_vext_aligned_bf16(<8 x bfloat> %a) { |
| ; CHECK-LABEL: test_vext_aligned_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 |
| ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 |
| ; CHECK-NEXT: ret |
| entry: |
| %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| ret <4 x bfloat> %vext |
| } |
| |
| ; Narrowing extract of elements 3-6 of a <8 x bfloat> vector, a window that |
| ; straddles the two 64-bit halves. The assertions expect an ext of the |
| ; register with itself at byte offset #6 (3 elements of 2 bytes each), with |
| ; the result read from d0. |
| define dso_local <4 x bfloat> @test_vext_unaligned_bf16(<8 x bfloat> %a) { |
| ; CHECK-LABEL: test_vext_unaligned_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #6 |
| ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 |
| ; CHECK-NEXT: ret |
| entry: |
| %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6> |
| ret <4 x bfloat> %vext |
| } |
| |
| ; Stride-3 gather starting at element 0: picks elements 0,3,6,...,21 out of |
| ; a <32 x bfloat> source. No single permute instruction matches this mask; |
| ; the assertions expect two chained two-register tbl lookups whose index |
| ; vectors are loaded from the constant pool. |
| ; NOTE(review): the .LCPI16_* labels appear to encode this function's |
| ; position in the file, so inserting or removing functions above it would |
| ; require regenerating the assertions. |
| define <8 x bfloat> @shuffle3step0_bf16(<32 x bfloat> %src) { |
| ; CHECK-LABEL: shuffle3step0_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: adrp x8, .LCPI16_0 |
| ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 |
| ; CHECK-NEXT: mov v3.16b, v2.16b |
| ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_0] |
| ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 |
| ; CHECK-NEXT: adrp x8, .LCPI16_1 |
| ; CHECK-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v4.16b |
| ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_1] |
| ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b |
| ; CHECK-NEXT: ret |
| entry: |
| %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> |
| ret <8 x bfloat> %s1 |
| } |
| |
| ; Stride-3 gather starting at element 1: picks elements 1,4,7,...,22 out of |
| ; a <32 x bfloat> source. The assertions expect two chained two-register tbl |
| ; lookups whose index vectors are loaded from the constant pool. |
| ; NOTE(review): the .LCPI17_* labels appear to encode this function's |
| ; position in the file, so inserting or removing functions above it would |
| ; require regenerating the assertions. |
| define <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) { |
| ; CHECK-LABEL: shuffle3step1_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: adrp x8, .LCPI17_0 |
| ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 |
| ; CHECK-NEXT: mov v3.16b, v2.16b |
| ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_0] |
| ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 |
| ; CHECK-NEXT: adrp x8, .LCPI17_1 |
| ; CHECK-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v4.16b |
| ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_1] |
| ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b |
| ; CHECK-NEXT: ret |
| entry: |
| %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> |
| ret <8 x bfloat> %s1 |
| } |
| |
| ; Stride-3 gather starting at element 2: picks elements 2,5,8,...,23 out of |
| ; a <32 x bfloat> source. The assertions expect two chained two-register tbl |
| ; lookups whose index vectors are loaded from the constant pool. |
| ; NOTE(review): the .LCPI18_* labels appear to encode this function's |
| ; position in the file, so inserting or removing functions above it would |
| ; require regenerating the assertions. |
| define <8 x bfloat> @shuffle3step2_bf16(<32 x bfloat> %src) { |
| ; CHECK-LABEL: shuffle3step2_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: adrp x8, .LCPI18_0 |
| ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 |
| ; CHECK-NEXT: mov v3.16b, v2.16b |
| ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_0] |
| ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 |
| ; CHECK-NEXT: adrp x8, .LCPI18_1 |
| ; CHECK-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v4.16b |
| ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_1] |
| ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b |
| ; CHECK-NEXT: ret |
| entry: |
| %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> |
| ret <8 x bfloat> %s1 |
| } |
| |
| |
| ; Reverses the four 16-bit elements of a <4 x bfloat> vector (mask |
| ; <3,2,1,0>). The assertions expect a single rev64 on .4h lanes. |
| define dso_local <4 x bfloat> @test_vrev64_bf16(<4 x bfloat> %a) { |
| ; CHECK-LABEL: test_vrev64_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rev64 v0.4h, v0.4h |
| ; CHECK-NEXT: ret |
| entry: |
| %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> |
| ret <4 x bfloat> %shuffle.i |
| } |
| |
| ; Reverses element order independently within each group of four elements |
| ; of a <8 x bfloat> vector (mask <3,2,1,0,7,6,5,4>). The assertions expect a |
| ; single rev64 on .8h lanes. |
| define dso_local <8 x bfloat> @test_vrev64q_bf16(<8 x bfloat> %a) { |
| ; CHECK-LABEL: test_vrev64q_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rev64 v0.8h, v0.8h |
| ; CHECK-NEXT: ret |
| entry: |
| %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> |
| ret <8 x bfloat> %shuffle.i |
| } |
| |
| ; Swaps each adjacent pair of elements in a <4 x bfloat> vector (mask |
| ; <1,0,3,2>). The assertions expect a single rev32 on .4h lanes. |
| define dso_local <4 x bfloat> @test_vrev32_bf16(<4 x bfloat> %a) { |
| ; CHECK-LABEL: test_vrev32_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rev32 v0.4h, v0.4h |
| ; CHECK-NEXT: ret |
| entry: |
| %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> |
| ret <4 x bfloat> %shuffle.i |
| } |
| |
| ; Swaps each adjacent pair of elements in a <8 x bfloat> vector (mask |
| ; <1,0,3,2,5,4,7,6>). The assertions expect a single rev32 on .8h lanes. |
| define dso_local <8 x bfloat> @test_vrev32q_bf16(<8 x bfloat> %a) { |
| ; CHECK-LABEL: test_vrev32q_bf16: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rev32 v0.8h, v0.8h |
| ; CHECK-NEXT: ret |
| entry: |
| %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> |
| ret <8 x bfloat> %shuffle.i |
| } |
| |
| ; Load-and-broadcast into <4 x bfloat>: one bfloat is loaded from %b and |
| ; inserted into each of the four lanes with separate insertelement |
| ; instructions. The assertions expect the whole chain to fold into a single |
| ; ld1r on .4h lanes. |
| define <4 x bfloat> @test_vld_dup1_4xbfloat(ptr %b) { |
| ; CHECK-LABEL: test_vld_dup1_4xbfloat: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ld1r { v0.4h }, [x0] |
| ; CHECK-NEXT: ret |
| entry: |
| %elt = load bfloat, ptr %b, align 2 |
| %fill.0 = insertelement <4 x bfloat> undef, bfloat %elt, i32 0 |
| %fill.1 = insertelement <4 x bfloat> %fill.0, bfloat %elt, i32 1 |
| %fill.2 = insertelement <4 x bfloat> %fill.1, bfloat %elt, i32 2 |
| %fill.3 = insertelement <4 x bfloat> %fill.2, bfloat %elt, i32 3 |
| ret <4 x bfloat> %fill.3 |
| } |
| |
| ; Load-and-broadcast into <8 x bfloat>: one bfloat is loaded from %b, |
| ; placed in lane 0 and then splatted with a zero-mask shufflevector. The |
| ; assertions expect this to fold into a single ld1r on .8h lanes. |
| define <8 x bfloat> @test_vld_dup1_8xbfloat(ptr %b) local_unnamed_addr { |
| ; CHECK-LABEL: test_vld_dup1_8xbfloat: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ld1r { v0.8h }, [x0] |
| ; CHECK-NEXT: ret |
| entry: |
| %elt = load bfloat, ptr %b, align 2 |
| %lane0 = insertelement <8 x bfloat> undef, bfloat %elt, i32 0 |
| %splat = shufflevector <8 x bfloat> %lane0, <8 x bfloat> undef, <8 x i32> zeroinitializer |
| ret <8 x bfloat> %splat |
| } |
| |
| ; Concatenates a <4 x bfloat> vector with itself: both shuffle operands are |
| ; %a and the mask <0,...,7> is the identity over the 8-element pair. The |
| ; assertions expect the low 64-bit lane to be copied into the high lane |
| ; with a single element mov. |
| define <8 x bfloat> @test_shufflevector8xbfloat(<4 x bfloat> %a) { |
| ; CHECK-LABEL: test_shufflevector8xbfloat: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NEXT: mov v0.d[1], v0.d[0] |
| ; CHECK-NEXT: ret |
| entry: |
| %r = shufflevector <4 x bfloat> %a, <4 x bfloat> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| ret <8 x bfloat> %r |
| } |
| |