blob: 8aab45c4dce57edc73e536caeb9555fd66d1df1a [file]
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
target triple = "wasm32"
; Test case: bb2053_inner_loop
;
; Indexed i8 multiply-accumulate with four i32 accumulators. Every iteration
; loads an i32 offset from %indices, reads one byte at that offset and one byte
; %stride further on from both %base0 and %base1, and multiplies those bytes
; with four consecutive weight bytes.
;
; Each of the three expectation groups below is anchored to this function with
; its own label directive, so a pattern can never be satisfied by the output of
; a later function in the file.
; CHECK-LABEL: bb2053_inner_loop:
; CHECK: loop
; CHECK: v128.load
; CHECK: i32x4.extract_lane 3
; CHECK: i32x4.extract_lane 2
; CHECK: i32x4.extract_lane 1
; CHECK: i32x4.extract_lane 0
; CHECK: v128.load8_splat 0
; CHECK: v128.load8_lane 0, 1
; CHECK: v128.load8_lane 0, 2
; CHECK: v128.load8_lane 0, 3
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: v128.load
; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: v128.load8_splat 0
; CHECK: v128.load8_lane 0, 1
; CHECK: v128.load8_lane 0, 2
; CHECK: v128.load8_lane 0, 3
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: v128.load8_splat 0
; CHECK: v128.load8_lane 0, 1
; CHECK: v128.load8_lane 0, 2
; CHECK: v128.load8_lane 0, 3
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: v128.load8_splat 0
; CHECK: v128.load8_lane 0, 1
; CHECK: v128.load8_lane 0, 2
; CHECK: v128.load8_lane 0, 3
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; MAX-BANDWIDTH-LABEL: bb2053_inner_loop:
; MAX-BANDWIDTH: loop
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.extract_lane 3
; MAX-BANDWIDTH: i32x4.extract_lane 2
; MAX-BANDWIDTH: i32x4.extract_lane 1
; MAX-BANDWIDTH: i32x4.extract_lane 0
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.extract_lane 3
; MAX-BANDWIDTH: i32x4.extract_lane 2
; MAX-BANDWIDTH: i32x4.extract_lane 1
; MAX-BANDWIDTH: i32x4.extract_lane 0
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.extract_lane 3
; MAX-BANDWIDTH: i32x4.extract_lane 2
; MAX-BANDWIDTH: i32x4.extract_lane 1
; MAX-BANDWIDTH: i32x4.extract_lane 0
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.extract_lane 3
; MAX-BANDWIDTH: i32x4.extract_lane 2
; MAX-BANDWIDTH: i32x4.extract_lane 1
; MAX-BANDWIDTH: i32x4.extract_lane 0
; MAX-BANDWIDTH: v128.load8_splat 0
; MAX-BANDWIDTH: v128.load8_lane 0, 1
; MAX-BANDWIDTH: v128.load8_lane 0, 2
; MAX-BANDWIDTH: v128.load8_lane 0, 3
; MAX-BANDWIDTH: v128.load8_lane 0, 4
; MAX-BANDWIDTH: v128.load8_lane 0, 5
; MAX-BANDWIDTH: v128.load8_lane 0, 6
; MAX-BANDWIDTH: v128.load8_lane 0, 7
; MAX-BANDWIDTH: v128.load8_lane 0, 8
; MAX-BANDWIDTH: v128.load8_lane 0, 9
; MAX-BANDWIDTH: v128.load8_lane 0, 10
; MAX-BANDWIDTH: v128.load8_lane 0, 11
; MAX-BANDWIDTH: v128.load8_lane 0, 12
; MAX-BANDWIDTH: v128.load8_lane 0, 13
; MAX-BANDWIDTH: v128.load8_lane 0, 14
; MAX-BANDWIDTH: v128.load8_lane 0, 15
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: v128.load8_splat 0
; MAX-BANDWIDTH: v128.load8_lane 0, 1
; MAX-BANDWIDTH: v128.load8_lane 0, 2
; MAX-BANDWIDTH: v128.load8_lane 0, 3
; MAX-BANDWIDTH: v128.load8_lane 0, 4
; MAX-BANDWIDTH: v128.load8_lane 0, 5
; MAX-BANDWIDTH: v128.load8_lane 0, 6
; MAX-BANDWIDTH: v128.load8_lane 0, 7
; MAX-BANDWIDTH: v128.load8_lane 0, 8
; MAX-BANDWIDTH: v128.load8_lane 0, 9
; MAX-BANDWIDTH: v128.load8_lane 0, 10
; MAX-BANDWIDTH: v128.load8_lane 0, 11
; MAX-BANDWIDTH: v128.load8_lane 0, 12
; MAX-BANDWIDTH: v128.load8_lane 0, 13
; MAX-BANDWIDTH: v128.load8_lane 0, 14
; MAX-BANDWIDTH: v128.load8_lane 0, 15
; MAX-BANDWIDTH: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: v128.load8_splat 0
; MAX-BANDWIDTH: v128.load8_lane 0, 1
; MAX-BANDWIDTH: v128.load8_lane 0, 2
; MAX-BANDWIDTH: v128.load8_lane 0, 3
; MAX-BANDWIDTH: v128.load8_lane 0, 4
; MAX-BANDWIDTH: v128.load8_lane 0, 5
; MAX-BANDWIDTH: v128.load8_lane 0, 6
; MAX-BANDWIDTH: v128.load8_lane 0, 7
; MAX-BANDWIDTH: v128.load8_lane 0, 8
; MAX-BANDWIDTH: v128.load8_lane 0, 9
; MAX-BANDWIDTH: v128.load8_lane 0, 10
; MAX-BANDWIDTH: v128.load8_lane 0, 11
; MAX-BANDWIDTH: v128.load8_lane 0, 12
; MAX-BANDWIDTH: v128.load8_lane 0, 13
; MAX-BANDWIDTH: v128.load8_lane 0, 14
; MAX-BANDWIDTH: v128.load8_lane 0, 15
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: v128.load8_splat 0
; MAX-BANDWIDTH: v128.load8_lane 0, 1
; MAX-BANDWIDTH: v128.load8_lane 0, 2
; MAX-BANDWIDTH: v128.load8_lane 0, 3
; MAX-BANDWIDTH: v128.load8_lane 0, 4
; MAX-BANDWIDTH: v128.load8_lane 0, 5
; MAX-BANDWIDTH: v128.load8_lane 0, 6
; MAX-BANDWIDTH: v128.load8_lane 0, 7
; MAX-BANDWIDTH: v128.load8_lane 0, 8
; MAX-BANDWIDTH: v128.load8_lane 0, 9
; MAX-BANDWIDTH: v128.load8_lane 0, 10
; MAX-BANDWIDTH: v128.load8_lane 0, 11
; MAX-BANDWIDTH: v128.load8_lane 0, 12
; MAX-BANDWIDTH: v128.load8_lane 0, 13
; MAX-BANDWIDTH: v128.load8_lane 0, 14
; MAX-BANDWIDTH: v128.load8_lane 0, 15
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH-LABEL: bb2053_inner_loop:
; RELAXED-MAX-BANDWIDTH: loop
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s

; Parameters:
;   %base0, %base1  byte arrays addressed through the loaded offset
;   %weights        byte array consumed four bytes per iteration
;   %indices        i32 array, one offset per iteration
;   %len            iteration count compared against the incremented counter
;   %stride         byte distance from the first to the second element read
;                   from each base array
;   %acc0..%acc3    initial values of the four accumulators
; Returns { A, B, C, D }, the four accumulator values after the final iteration.
define hidden { i32, i32, i32, i32 } @bb2053_inner_loop(ptr nocapture %base0, ptr nocapture %base1, ptr nocapture %weights, ptr nocapture readonly %indices, i32 %len, i32 %stride, i32 %acc0, i32 %acc1, i32 %acc2, i32 %acc3) local_unnamed_addr {
entry:
  br label %bb2053.loop
bb2053.loop:
  ; Loop-carried state: counter, four accumulators and the weight cursor.
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb2053.loop ]
  %accA = phi i32 [ %acc0, %entry ], [ %accA.sum, %bb2053.loop ]
  %accB = phi i32 [ %acc1, %entry ], [ %accB.sum, %bb2053.loop ]
  %accC = phi i32 [ %acc2, %entry ], [ %accC.sum, %bb2053.loop ]
  %accD = phi i32 [ %acc3, %entry ], [ %accD.sum, %bb2053.loop ]
  %wptr = phi ptr [ %weights, %entry ], [ %wptr.next, %bb2053.loop ]
  ; The same loaded offset addresses both base arrays.
  %idx.ptr = getelementptr inbounds nuw i32, ptr %indices, i32 %idx
  %idx.val = load i32, ptr %idx.ptr, align 4
  %lhs0.ptr = getelementptr inbounds i8, ptr %base0, i32 %idx.val
  %rhs0.ptr = getelementptr inbounds i8, ptr %base1, i32 %idx.val
  ; A += w[0] * base0[off]
  %lhs0 = load i8, ptr %lhs0.ptr, align 1
  %lhs0.sext = sext i8 %lhs0 to i32
  %w0 = load i8, ptr %wptr, align 1
  %w0.sext = sext i8 %w0 to i32
  %mul0 = mul nsw i32 %w0.sext, %lhs0.sext
  %accA.next = add nsw i32 %mul0, %accA
  ; C += w[1] * base0[off]
  %w1.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 1
  %w1 = load i8, ptr %w1.ptr, align 1
  %w1.sext = sext i8 %w1 to i32
  %mul1 = mul nsw i32 %w1.sext, %lhs0.sext
  %accC.next = add nsw i32 %mul1, %accC
  ; A += w[2] * base0[off + stride]
  %lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs0.ptr, i32 %stride
  %lhs1 = load i8, ptr %lhs1.ptr, align 1
  %lhs1.sext = sext i8 %lhs1 to i32
  %w2.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 2
  %w2 = load i8, ptr %w2.ptr, align 1
  %w2.sext = sext i8 %w2 to i32
  %mul2 = mul nsw i32 %w2.sext, %lhs1.sext
  %accA.sum = add nsw i32 %accA.next, %mul2
  ; C += w[3] * base0[off + stride]
  %w3.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 3
  %w3 = load i8, ptr %w3.ptr, align 1
  %w3.sext = sext i8 %w3 to i32
  %mul3 = mul nsw i32 %w3.sext, %lhs1.sext
  %accC.sum = add nsw i32 %accC.next, %mul3
  ; B += w[0] * base1[off]; D += w[1] * base1[off]
  %rhs0 = load i8, ptr %rhs0.ptr, align 1
  %rhs0.sext = sext i8 %rhs0 to i32
  %mul4 = mul nsw i32 %rhs0.sext, %w0.sext
  %accB.next = add nsw i32 %mul4, %accB
  %mul5 = mul nsw i32 %rhs0.sext, %w1.sext
  %accD.next = add nsw i32 %mul5, %accD
  ; B += w[2] * base1[off + stride]; D += w[3] * base1[off + stride]
  %rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs0.ptr, i32 %stride
  %rhs1 = load i8, ptr %rhs1.ptr, align 1
  %rhs1.sext = sext i8 %rhs1 to i32
  %mul6 = mul nsw i32 %rhs1.sext, %w2.sext
  %accB.sum = add nsw i32 %accB.next, %mul6
  %mul7 = mul nsw i32 %rhs1.sext, %w3.sext
  %accD.sum = add nsw i32 %accD.next, %mul7
  ; Advance past the four weights just used; the exit test runs after the body.
  %wptr.next = getelementptr inbounds nuw i8, ptr %wptr, i32 4
  %idx.next = add nuw nsw i32 %idx, 1
  %exit = icmp eq i32 %idx.next, %len
  br i1 %exit, label %bb2053.exit, label %bb2053.loop
bb2053.exit:
  ; Pack the accumulators in A, B, C, D order.
  %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %accA.sum, 0
  %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %accB.sum, 1
  %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %accC.sum, 2
  %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %accD.sum, 3
  ret { i32, i32, i32, i32 } %res3
}
; Test case: bb41_inner_loop
;
; 2x2 i8 multiply-accumulate: every iteration reads two adjacent bytes from
; %lhs and two from %rhs and adds each of the four pairwise products to its own
; i32 accumulator.
;
; Each of the three expectation groups below is anchored to this function with
; its own label directive, so a pattern can never be satisfied by the output of
; a neighbouring function in the file.
; CHECK-LABEL: bb41_inner_loop:
; CHECK: loop
; CHECK: v128.load64_zero
; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: v128.load64_zero
; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; MAX-BANDWIDTH-LABEL: bb41_inner_loop:
; MAX-BANDWIDTH: loop
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH-LABEL: bb41_inner_loop:
; RELAXED-MAX-BANDWIDTH: loop
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s

; Parameters:
;   %lhs, %rhs       byte arrays, each consumed two bytes per iteration
;   %len             iteration count compared against the incremented counter
;   %acc00..%acc11   initial values of the four accumulators
; Returns { acc00, acc01, acc10, acc11 } after the final iteration.
define hidden { i32, i32, i32, i32 } @bb41_inner_loop(ptr nocapture %lhs, ptr nocapture %rhs, i32 %len, i32 %acc00, i32 %acc01, i32 %acc10, i32 %acc11) local_unnamed_addr {
entry:
  br label %bb41
bb41:
  ; Loop-carried state: counter, both input cursors and four accumulators.
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb41 ]
  %lhs.ptr = phi ptr [ %lhs, %entry ], [ %lhs.next, %bb41 ]
  %rhs.ptr = phi ptr [ %rhs, %entry ], [ %rhs.next, %bb41 ]
  %acc00.phi = phi i32 [ %acc00, %entry ], [ %acc00.next, %bb41 ]
  %acc01.phi = phi i32 [ %acc01, %entry ], [ %acc01.next, %bb41 ]
  %acc10.phi = phi i32 [ %acc10, %entry ], [ %acc10.next, %bb41 ]
  %acc11.phi = phi i32 [ %acc11, %entry ], [ %acc11.next, %bb41 ]
  ; acc00 += rhs[0] * lhs[0]
  %lhs0 = load i8, ptr %lhs.ptr, align 1
  %lhs0.sext = sext i8 %lhs0 to i32
  %rhs0 = load i8, ptr %rhs.ptr, align 1
  %rhs0.sext = sext i8 %rhs0 to i32
  %mul00 = mul nsw i32 %rhs0.sext, %lhs0.sext
  %acc00.next = add nsw i32 %mul00, %acc00.phi
  ; acc01 += rhs[1] * lhs[0]
  %rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs.ptr, i32 1
  %rhs1 = load i8, ptr %rhs1.ptr, align 1
  %rhs1.sext = sext i8 %rhs1 to i32
  %mul01 = mul nsw i32 %rhs1.sext, %lhs0.sext
  %acc01.next = add nsw i32 %mul01, %acc01.phi
  ; acc10 += lhs[1] * rhs[0]
  %lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs.ptr, i32 1
  %lhs1 = load i8, ptr %lhs1.ptr, align 1
  %lhs1.sext = sext i8 %lhs1 to i32
  %mul10 = mul nsw i32 %lhs1.sext, %rhs0.sext
  %acc10.next = add nsw i32 %mul10, %acc10.phi
  ; acc11 += lhs[1] * rhs[1]
  %mul11 = mul nsw i32 %lhs1.sext, %rhs1.sext
  %acc11.next = add nsw i32 %mul11, %acc11.phi
  ; Both cursors move two bytes; the exit test runs after the body.
  %lhs.next = getelementptr inbounds nuw i8, ptr %lhs.ptr, i32 2
  %rhs.next = getelementptr inbounds nuw i8, ptr %rhs.ptr, i32 2
  %idx.next = add nuw nsw i32 %idx, 1
  %exit = icmp eq i32 %idx.next, %len
  br i1 %exit, label %bb41.exit, label %bb41
bb41.exit:
  ; Pack the accumulators in 00, 01, 10, 11 order.
  %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %acc00.next, 0
  %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %acc01.next, 1
  %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %acc10.next, 2
  %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %acc11.next, 3
  ret { i32, i32, i32, i32 } %res3
}
; Test case: bb41_inner_loop_i16
;
; i16 counterpart of the 2x2 multiply-accumulate loop. Every iteration reads the
; i16 elements at index 0 and index 2 from %lhs and from %rhs, adds each of the
; four pairwise products to its own i32 accumulator, and then advances both
; cursors by four i16 elements.
;
; Each of the three expectation groups below is anchored to this function with
; its own label directive, so a pattern can never be satisfied by the output of
; an earlier function in the file.
; CHECK-LABEL: bb41_inner_loop_i16:
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.load
; CHECK: v128.load
; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; CHECK: i32x4.extmul_low_i16x8_s
; CHECK: i32x4.add
; MAX-BANDWIDTH-LABEL: bb41_inner_loop_i16:
; MAX-BANDWIDTH: loop
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH-LABEL: bb41_inner_loop_i16:
; RELAXED-MAX-BANDWIDTH: loop
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.add

; Parameters:
;   %lhs, %rhs       i16 arrays, each advanced four elements per iteration
;   %len             iteration count compared against the incremented counter
;   %acc00..%acc11   initial values of the four accumulators
; Returns { acc00, acc01, acc10, acc11 } after the final iteration.
define hidden { i32, i32, i32, i32 } @bb41_inner_loop_i16(ptr nocapture %lhs, ptr nocapture %rhs, i32 %len, i32 %acc00, i32 %acc01, i32 %acc10, i32 %acc11) local_unnamed_addr {
entry:
  br label %bb41
bb41:
  ; Loop-carried state: counter, both input cursors and four accumulators.
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb41 ]
  %lhs.ptr = phi ptr [ %lhs, %entry ], [ %lhs.next, %bb41 ]
  %rhs.ptr = phi ptr [ %rhs, %entry ], [ %rhs.next, %bb41 ]
  %acc00.phi = phi i32 [ %acc00, %entry ], [ %acc00.next, %bb41 ]
  %acc01.phi = phi i32 [ %acc01, %entry ], [ %acc01.next, %bb41 ]
  %acc10.phi = phi i32 [ %acc10, %entry ], [ %acc10.next, %bb41 ]
  %acc11.phi = phi i32 [ %acc11, %entry ], [ %acc11.next, %bb41 ]
  ; acc00 += rhs[0] * lhs[0]
  %lhs0 = load i16, ptr %lhs.ptr, align 2
  %lhs0.sext = sext i16 %lhs0 to i32
  %rhs0 = load i16, ptr %rhs.ptr, align 2
  %rhs0.sext = sext i16 %rhs0 to i32
  %mul00 = mul nsw i32 %rhs0.sext, %lhs0.sext
  %acc00.next = add nsw i32 %mul00, %acc00.phi
  ; acc01 += rhs[2] * lhs[0]
  %rhs1.ptr = getelementptr inbounds nuw i16, ptr %rhs.ptr, i32 2
  %rhs1 = load i16, ptr %rhs1.ptr, align 2
  %rhs1.sext = sext i16 %rhs1 to i32
  %mul01 = mul nsw i32 %rhs1.sext, %lhs0.sext
  %acc01.next = add nsw i32 %mul01, %acc01.phi
  ; acc10 += lhs[2] * rhs[0]
  %lhs1.ptr = getelementptr inbounds nuw i16, ptr %lhs.ptr, i32 2
  %lhs1 = load i16, ptr %lhs1.ptr, align 2
  %lhs1.sext = sext i16 %lhs1 to i32
  %mul10 = mul nsw i32 %lhs1.sext, %rhs0.sext
  %acc10.next = add nsw i32 %mul10, %acc10.phi
  ; acc11 += lhs[2] * rhs[2]
  %mul11 = mul nsw i32 %lhs1.sext, %rhs1.sext
  %acc11.next = add nsw i32 %mul11, %acc11.phi
  ; Both cursors move four i16 elements, so elements 1 and 3 of each group are
  ; never read; the exit test runs after the body.
  %lhs.next = getelementptr inbounds nuw i16, ptr %lhs.ptr, i32 4
  %rhs.next = getelementptr inbounds nuw i16, ptr %rhs.ptr, i32 4
  %idx.next = add nuw nsw i32 %idx, 1
  %exit = icmp eq i32 %idx.next, %len
  br i1 %exit, label %bb41.exit, label %bb41
bb41.exit:
  ; Pack the accumulators in 00, 01, 10, 11 order.
  %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %acc00.next, 0
  %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %acc01.next, 1
  %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %acc10.next, 2
  %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %acc11.next, 3
  ret { i32, i32, i32, i32 } %res3
}