; blob: d96b89f9763d88df1047d3c782dd354cec4df7d0 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
; The module targets 32-bit WebAssembly, matching the -mtriple=wasm32 that every
; RUN line passes to llc. The RUN lines run loop-vectorize in three
; configurations, each checked under its own FileCheck prefix:
;   CHECK                 - +simd128
;   MAX-BANDWIDTH         - +simd128 with -vectorizer-maximize-bandwidth
;   RELAXED-MAX-BANDWIDTH - +simd128,+relaxed-simd with -vectorizer-maximize-bandwidth
; The assertions are autogenerated (see the NOTE line); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
target triple = "wasm32"
define hidden { i32, i32, i32, i32 } @bb2053_inner_loop(ptr nocapture %base0, ptr nocapture %base1, ptr nocapture %weights, ptr nocapture readonly %indices, i32 %len, i32 %stride, i32 %acc0, i32 %acc1, i32 %acc2, i32 %acc3) local_unnamed_addr {
; CHECK-LABEL: bb2053_inner_loop:
; CHECK: .functype bb2053_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; CHECK-NEXT: .local i32, i32, v128, v128, v128, v128, v128, i32, i32, i32, i32, i32, i32, v128, v128, v128, v128, v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: local.set 11
; CHECK-NEXT: block
; CHECK-NEXT: block
; CHECK-NEXT: block
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.ge_u
; CHECK-NEXT: br_if 0 # 0: down to label2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.set 12
; CHECK-NEXT: br 1 # 1: down to label1
; CHECK-NEXT: .LBB0_2: # %vector.ph
; CHECK-NEXT: end_block # label2:
; CHECK-NEXT: v128.const 0, 0, 0, 0
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 14
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 15
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 16
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 17
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.and
; CHECK-NEXT: local.tee 11
; CHECK-NEXT: i32.const 2
; CHECK-NEXT: i32.shl
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 12
; CHECK-NEXT: local.get 4
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.set 9
; CHECK-NEXT: .LBB0_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label3:
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 10
; CHECK-NEXT: v128.load 0:p2align=2
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: i32x4.extract_lane 3
; CHECK-NEXT: local.tee 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 7
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i32x4.extract_lane 2
; CHECK-NEXT: local.tee 18
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 19
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i32x4.extract_lane 1
; CHECK-NEXT: local.tee 20
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 21
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.tee 22
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 23
; CHECK-NEXT: v128.load8_splat 0
; CHECK-NEXT: v128.load8_lane 0, 1
; CHECK-NEXT: v128.load8_lane 0, 2
; CHECK-NEXT: v128.load8_lane 0, 3
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 24
; CHECK-NEXT: local.get 3
; CHECK-NEXT: v128.load 0:p2align=0
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 25
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 14
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.get 7
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 19
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 21
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 23
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load8_splat 0
; CHECK-NEXT: v128.load8_lane 0, 1
; CHECK-NEXT: v128.load8_lane 0, 2
; CHECK-NEXT: v128.load8_lane 0, 3
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 26
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 27
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 14
; CHECK-NEXT: local.get 24
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 28
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 16
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.get 26
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 16
; CHECK-NEXT: local.get 25
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 8
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 18
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 7
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 20
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 18
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 22
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 19
; CHECK-NEXT: v128.load8_splat 0
; CHECK-NEXT: v128.load8_lane 0, 1
; CHECK-NEXT: v128.load8_lane 0, 2
; CHECK-NEXT: v128.load8_lane 0, 3
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 24
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 15
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.get 27
; CHECK-NEXT: local.get 8
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 7
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 18
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 19
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load8_splat 0
; CHECK-NEXT: v128.load8_lane 0, 1
; CHECK-NEXT: v128.load8_lane 0, 2
; CHECK-NEXT: v128.load8_lane 0, 3
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 25
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 15
; CHECK-NEXT: local.get 28
; CHECK-NEXT: local.get 24
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 17
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 25
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 17
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 3
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 9
; CHECK-NEXT: br_if 0 # 0: up to label3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: end_loop
; CHECK-NEXT: local.get 14
; CHECK-NEXT: local.get 14
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 15
; CHECK-NEXT: local.get 15
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 16
; CHECK-NEXT: local.get 16
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 8
; CHECK-NEXT: local.get 17
; CHECK-NEXT: local.get 17
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 7
; CHECK-NEXT: local.get 5
; CHECK-NEXT: local.get 11
; CHECK-NEXT: i32.eq
; CHECK-NEXT: br_if 1 # 1: down to label0
; CHECK-NEXT: .LBB0_5: # %scalar.ph
; CHECK-NEXT: end_block # label1:
; CHECK-NEXT: local.get 5
; CHECK-NEXT: local.get 11
; CHECK-NEXT: i32.sub
; CHECK-NEXT: local.set 18
; CHECK-NEXT: local.get 4
; CHECK-NEXT: local.get 11
; CHECK-NEXT: i32.const 2
; CHECK-NEXT: i32.shl
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 3
; CHECK-NEXT: .LBB0_6: # %bb2053.loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label4:
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.load 0
; CHECK-NEXT: local.tee 19
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 20
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 21
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 22
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 20
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 20
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i32.const 3
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 23
; CHECK-NEXT: i32.mul
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 22
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 19
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 19
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 11
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 23
; CHECK-NEXT: local.get 19
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 19
; CHECK-NEXT: i32.mul
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 21
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 22
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 20
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i32.const 2
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 21
; CHECK-NEXT: i32.mul
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 8
; CHECK-NEXT: local.get 22
; CHECK-NEXT: local.get 11
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.get 21
; CHECK-NEXT: local.get 19
; CHECK-NEXT: i32.mul
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 7
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 3
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 12
; CHECK-NEXT: local.get 18
; CHECK-NEXT: i32.const -1
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 18
; CHECK-NEXT: br_if 0 # 0: up to label4
; CHECK-NEXT: .LBB0_7: # %bb2053.exit
; CHECK-NEXT: end_loop
; CHECK-NEXT: end_block # label0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.store 12
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.store 4
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.store 0
; CHECK-NEXT: # fallthrough-return
;
; MAX-BANDWIDTH-LABEL: bb2053_inner_loop:
; MAX-BANDWIDTH: .functype bb2053_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; MAX-BANDWIDTH-NEXT: .local i32, i32, v128, v128, v128, v128, v128, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128
; MAX-BANDWIDTH-NEXT: # %bb.0: # %entry
; MAX-BANDWIDTH-NEXT: i32.const 0
; MAX-BANDWIDTH-NEXT: local.set 11
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32.const 16
; MAX-BANDWIDTH-NEXT: i32.ge_u
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label2
; MAX-BANDWIDTH-NEXT: # %bb.1:
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.set 12
; MAX-BANDWIDTH-NEXT: br 1 # 1: down to label1
; MAX-BANDWIDTH-NEXT: .LBB0_2: # %vector.ph
; MAX-BANDWIDTH-NEXT: end_block # label2:
; MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 14
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 15
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 16
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 17
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32.const -16
; MAX-BANDWIDTH-NEXT: i32.and
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: i32.const 2
; MAX-BANDWIDTH-NEXT: i32.shl
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 12
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: .LBB0_3: # %vector.body
; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; MAX-BANDWIDTH-NEXT: loop # label3:
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=2
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; MAX-BANDWIDTH-NEXT: local.tee 8
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 7
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; MAX-BANDWIDTH-NEXT: local.tee 18
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 19
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; MAX-BANDWIDTH-NEXT: local.tee 20
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 21
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.tee 22
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 23
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=2
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; MAX-BANDWIDTH-NEXT: local.tee 24
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 25
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; MAX-BANDWIDTH-NEXT: local.tee 26
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 27
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; MAX-BANDWIDTH-NEXT: local.tee 28
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 29
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.tee 30
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 31
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=2
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; MAX-BANDWIDTH-NEXT: local.tee 32
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 33
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; MAX-BANDWIDTH-NEXT: local.tee 34
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 35
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; MAX-BANDWIDTH-NEXT: local.tee 36
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 37
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.tee 38
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 39
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=2
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; MAX-BANDWIDTH-NEXT: local.tee 40
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 41
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; MAX-BANDWIDTH-NEXT: local.tee 42
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 43
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; MAX-BANDWIDTH-NEXT: local.tee 44
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 45
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.tee 46
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 47
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 48
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 49
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 50
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 51
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 52
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 52
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: local.get 19
; MAX-BANDWIDTH-NEXT: local.get 21
; MAX-BANDWIDTH-NEXT: local.get 23
; MAX-BANDWIDTH-NEXT: local.get 25
; MAX-BANDWIDTH-NEXT: local.get 27
; MAX-BANDWIDTH-NEXT: local.get 29
; MAX-BANDWIDTH-NEXT: local.get 31
; MAX-BANDWIDTH-NEXT: local.get 33
; MAX-BANDWIDTH-NEXT: local.get 35
; MAX-BANDWIDTH-NEXT: local.get 37
; MAX-BANDWIDTH-NEXT: local.get 39
; MAX-BANDWIDTH-NEXT: local.get 41
; MAX-BANDWIDTH-NEXT: local.get 43
; MAX-BANDWIDTH-NEXT: local.get 45
; MAX-BANDWIDTH-NEXT: local.get 47
; MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; MAX-BANDWIDTH-NEXT: local.tee 53
; MAX-BANDWIDTH-NEXT: local.get 48
; MAX-BANDWIDTH-NEXT: local.get 49
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH-NEXT: local.get 50
; MAX-BANDWIDTH-NEXT: local.get 51
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 54
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 53
; MAX-BANDWIDTH-NEXT: local.get 54
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 14
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 48
; MAX-BANDWIDTH-NEXT: local.get 49
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH-NEXT: local.get 50
; MAX-BANDWIDTH-NEXT: local.get 51
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 55
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 55
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 53
; MAX-BANDWIDTH-NEXT: local.get 48
; MAX-BANDWIDTH-NEXT: local.get 49
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH-NEXT: local.get 50
; MAX-BANDWIDTH-NEXT: local.get 51
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 53
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 16
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 16
; MAX-BANDWIDTH-NEXT: local.get 52
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 8
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 7
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 18
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 22
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 19
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 24
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 20
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 26
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 21
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 28
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 22
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 30
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 23
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 32
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 24
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 34
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 25
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 36
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 26
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 38
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 27
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 40
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 28
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 42
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 29
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 44
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 30
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 46
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 31
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; MAX-BANDWIDTH-NEXT: local.tee 48
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 52
; MAX-BANDWIDTH-NEXT: local.get 48
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 54
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: local.get 19
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: local.get 21
; MAX-BANDWIDTH-NEXT: local.get 22
; MAX-BANDWIDTH-NEXT: local.get 23
; MAX-BANDWIDTH-NEXT: local.get 24
; MAX-BANDWIDTH-NEXT: local.get 25
; MAX-BANDWIDTH-NEXT: local.get 26
; MAX-BANDWIDTH-NEXT: local.get 27
; MAX-BANDWIDTH-NEXT: local.get 28
; MAX-BANDWIDTH-NEXT: local.get 29
; MAX-BANDWIDTH-NEXT: local.get 30
; MAX-BANDWIDTH-NEXT: local.get 31
; MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; MAX-BANDWIDTH-NEXT: local.tee 49
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 54
; MAX-BANDWIDTH-NEXT: local.get 49
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 15
; MAX-BANDWIDTH-NEXT: local.get 55
; MAX-BANDWIDTH-NEXT: local.get 48
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 55
; MAX-BANDWIDTH-NEXT: local.get 48
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 49
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 49
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 17
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 17
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: i32.const 64
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 3
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.const 64
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.const -16
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 9
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label3
; MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block
; MAX-BANDWIDTH-NEXT: end_loop
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 16
; MAX-BANDWIDTH-NEXT: local.get 16
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 8
; MAX-BANDWIDTH-NEXT: local.get 17
; MAX-BANDWIDTH-NEXT: local.get 17
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i32.eq
; MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label0
; MAX-BANDWIDTH-NEXT: .LBB0_5: # %scalar.ph
; MAX-BANDWIDTH-NEXT: end_block # label1:
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i32.sub
; MAX-BANDWIDTH-NEXT: local.set 18
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i32.const 2
; MAX-BANDWIDTH-NEXT: i32.shl
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 3
; MAX-BANDWIDTH-NEXT: .LBB0_6: # %bb2053.loop
; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; MAX-BANDWIDTH-NEXT: loop # label4:
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: i32.load 0
; MAX-BANDWIDTH-NEXT: local.tee 19
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 20
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 21
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i32.const 1
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 22
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 20
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i32.const 3
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 23
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 22
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 19
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 19
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 24
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 23
; MAX-BANDWIDTH-NEXT: local.get 19
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 19
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 21
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 22
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i32.const 2
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 21
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 8
; MAX-BANDWIDTH-NEXT: local.get 22
; MAX-BANDWIDTH-NEXT: local.get 24
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.get 21
; MAX-BANDWIDTH-NEXT: local.get 19
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: i32.const 4
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 3
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i32.const 4
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 12
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: i32.const -1
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 18
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label4
; MAX-BANDWIDTH-NEXT: .LBB0_7: # %bb2053.exit
; MAX-BANDWIDTH-NEXT: end_loop
; MAX-BANDWIDTH-NEXT: end_block # label0:
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.store 12
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.store 8
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.store 4
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.store 0
; MAX-BANDWIDTH-NEXT: # fallthrough-return
;
; RELAXED-MAX-BANDWIDTH-LABEL: bb2053_inner_loop:
; RELAXED-MAX-BANDWIDTH: .functype bb2053_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; RELAXED-MAX-BANDWIDTH-NEXT: .local i32, i32, v128, v128, v128, v128, v128, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.0: # %entry
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 16
; RELAXED-MAX-BANDWIDTH-NEXT: i32.ge_u
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label2
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.1:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12
; RELAXED-MAX-BANDWIDTH-NEXT: br 1 # 1: down to label1
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_2: # %vector.ph
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label2:
; RELAXED-MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 16
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16
; RELAXED-MAX-BANDWIDTH-NEXT: i32.and
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 11
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_3: # %vector.body
; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; RELAXED-MAX-BANDWIDTH-NEXT: loop # label3:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 25
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 26
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 27
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 28
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 29
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 30
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 32
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 33
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 34
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 35
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 36
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 37
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 38
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 39
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 40
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 41
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 42
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 43
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 44
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 45
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 46
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 47
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 48
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 49
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 50
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 51
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 52
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 25
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 27
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 29
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 33
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 35
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 37
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 39
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 41
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 43
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 45
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 47
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 53
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 51
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 54
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 48
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 51
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 55
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 53
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 51
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 16
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 52
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 26
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 28
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 30
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 32
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 34
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 25
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 36
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 26
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 38
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 27
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 40
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 28
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 42
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 29
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 44
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 30
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 46
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 49
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 54
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 25
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 26
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 27
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 28
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 29
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 30
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 31
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 50
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 55
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label3
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block
; RELAXED-MAX-BANDWIDTH-NEXT: end_loop
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: i32.eq
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label0
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_5: # %scalar.ph
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label1:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 18
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 3
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_6: # %bb2053.loop
; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; RELAXED-MAX-BANDWIDTH-NEXT: loop # label4:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label4
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_7: # %bb2053.exit
; RELAXED-MAX-BANDWIDTH-NEXT: end_loop
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label0:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 0
; RELAXED-MAX-BANDWIDTH-NEXT: # fallthrough-return
; Scalar input loop for @bb2053_inner_loop.
; The entry block only branches into the loop; it is the incoming block that
; supplies the initial value of every phi below.
entry:
br label %bb2053.loop
; Each iteration:
;   * loads one i32 from %indices[%idx] and uses it as a byte offset into both
;     %base0 and %base1,
;   * loads two signed bytes from each base (at that offset and at
;     offset + %stride),
;   * loads four consecutive signed bytes w0..w3 starting at %wptr,
;   * updates four i32 running sums:
;       accA += w0*lhs0 + w2*lhs1
;       accC += w1*lhs0 + w3*lhs1
;       accB += w0*rhs0 + w2*rhs1
;       accD += w1*rhs0 + w3*rhs1
; The exit test is at the bottom of the block, so the body executes at least
; once before %len is compared.
bb2053.loop:
; Loop-carried state: iteration counter, four accumulators seeded from
; %acc0..%acc3, and the moving weights pointer.
%idx = phi i32 [ 0, %entry ], [ %idx.next, %bb2053.loop ]
%accA = phi i32 [ %acc0, %entry ], [ %accA.sum, %bb2053.loop ]
%accB = phi i32 [ %acc1, %entry ], [ %accB.sum, %bb2053.loop ]
%accC = phi i32 [ %acc2, %entry ], [ %accC.sum, %bb2053.loop ]
%accD = phi i32 [ %acc3, %entry ], [ %accD.sum, %bb2053.loop ]
%wptr = phi ptr [ %weights, %entry ], [ %wptr.next, %bb2053.loop ]
; Indirect addressing: the value loaded from %indices picks the byte position
; read from %base0 and %base1, so these accesses are not a function of %idx
; alone.
%idx.ptr = getelementptr inbounds nuw i32, ptr %indices, i32 %idx
%idx.val = load i32, ptr %idx.ptr, align 4
%lhs0.ptr = getelementptr inbounds i8, ptr %base0, i32 %idx.val
%rhs0.ptr = getelementptr inbounds i8, ptr %base1, i32 %idx.val
; accA: first term, w0 * lhs0.
%lhs0 = load i8, ptr %lhs0.ptr, align 1
%lhs0.sext = sext i8 %lhs0 to i32
%w0 = load i8, ptr %wptr, align 1
%w0.sext = sext i8 %w0 to i32
%mul0 = mul nsw i32 %w0.sext, %lhs0.sext
%accA.next = add nsw i32 %mul0, %accA
; accC: first term, w1 * lhs0.
%w1.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 1
%w1 = load i8, ptr %w1.ptr, align 1
%w1.sext = sext i8 %w1 to i32
%mul1 = mul nsw i32 %w1.sext, %lhs0.sext
%accC.next = add nsw i32 %mul1, %accC
; Second %base0 byte, %stride bytes past the first.
%lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs0.ptr, i32 %stride
%lhs1 = load i8, ptr %lhs1.ptr, align 1
%lhs1.sext = sext i8 %lhs1 to i32
; accA: second term, w2 * lhs1.
%w2.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 2
%w2 = load i8, ptr %w2.ptr, align 1
%w2.sext = sext i8 %w2 to i32
%mul2 = mul nsw i32 %w2.sext, %lhs1.sext
%accA.sum = add nsw i32 %accA.next, %mul2
; accC: second term, w3 * lhs1.
%w3.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 3
%w3 = load i8, ptr %w3.ptr, align 1
%w3.sext = sext i8 %w3 to i32
%mul3 = mul nsw i32 %w3.sext, %lhs1.sext
%accC.sum = add nsw i32 %accC.next, %mul3
; accB / accD: first terms, reusing the already-extended w0 and w1 against the
; first %base1 byte.
%rhs0 = load i8, ptr %rhs0.ptr, align 1
%rhs0.sext = sext i8 %rhs0 to i32
%mul4 = mul nsw i32 %rhs0.sext, %w0.sext
%accB.next = add nsw i32 %mul4, %accB
%mul5 = mul nsw i32 %rhs0.sext, %w1.sext
%accD.next = add nsw i32 %mul5, %accD
; accB / accD: second terms, w2 and w3 against the %base1 byte %stride bytes
; past the first.
%rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs0.ptr, i32 %stride
%rhs1 = load i8, ptr %rhs1.ptr, align 1
%rhs1.sext = sext i8 %rhs1 to i32
%mul6 = mul nsw i32 %rhs1.sext, %w2.sext
%accB.sum = add nsw i32 %accB.next, %mul6
%mul7 = mul nsw i32 %rhs1.sext, %w3.sext
%accD.sum = add nsw i32 %accD.next, %mul7
; Advance: four weight bytes are consumed per iteration; the loop leaves once
; the incremented counter equals %len.
%wptr.next = getelementptr inbounds nuw i8, ptr %wptr, i32 4
%idx.next = add nuw nsw i32 %idx, 1
%exit = icmp eq i32 %idx.next, %len
br i1 %exit, label %bb2053.exit, label %bb2053.loop
; Pack the final sums into the returned aggregate in field order
; 0 = accA, 1 = accB, 2 = accC, 3 = accD.
bb2053.exit:
%res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %accA.sum, 0
%res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %accB.sum, 1
%res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %accC.sum, 2
%res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %accD.sum, 3
ret { i32, i32, i32, i32 } %res3
}
define hidden { i32, i32, i32, i32 } @bb41_inner_loop(ptr nocapture %lhs, ptr nocapture %rhs, i32 %len, i32 %acc00, i32 %acc01, i32 %acc10, i32 %acc11) local_unnamed_addr {
; CHECK-LABEL: bb41_inner_loop:
; CHECK: .functype bb41_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; CHECK-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, i32
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: local.set 8
; CHECK-NEXT: block
; CHECK-NEXT: block
; CHECK-NEXT: block
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.ge_u
; CHECK-NEXT: br_if 0 # 0: down to label7
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.set 10
; CHECK-NEXT: br 1 # 1: down to label6
; CHECK-NEXT: .LBB1_2: # %vector.ph
; CHECK-NEXT: end_block # label7:
; CHECK-NEXT: v128.const 0, 0, 0, 0
; CHECK-NEXT: local.tee 11
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 12
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 13
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 14
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 11
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.and
; CHECK-NEXT: local.tee 8
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.shl
; CHECK-NEXT: local.tee 9
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 8
; CHECK-NEXT: local.set 7
; CHECK-NEXT: .LBB1_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label8:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: v128.load64_zero 0:p2align=0
; CHECK-NEXT: local.tee 15
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 16
; CHECK-NEXT: local.get 2
; CHECK-NEXT: v128.load64_zero 0:p2align=0
; CHECK-NEXT: local.tee 17
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 18
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 12
; CHECK-NEXT: local.get 16
; CHECK-NEXT: local.get 17
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 17
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 13
; CHECK-NEXT: local.get 18
; CHECK-NEXT: local.get 15
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: local.tee 15
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 14
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 14
; CHECK-NEXT: local.get 17
; CHECK-NEXT: local.get 15
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 11
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 11
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 7
; CHECK-NEXT: br_if 0 # 0: up to label8
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: end_loop
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 7
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 6
; CHECK-NEXT: local.get 14
; CHECK-NEXT: local.get 14
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 5
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 4
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.eq
; CHECK-NEXT: br_if 1 # 1: down to label5
; CHECK-NEXT: .LBB1_5: # %scalar.ph
; CHECK-NEXT: end_block # label6:
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.sub
; CHECK-NEXT: local.set 2
; CHECK-NEXT: .LBB1_6: # %bb41
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label9:
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 1
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 3
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 7
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 8
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.load8_s 0
; CHECK-NEXT: local.tee 19
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 4
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 6
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 19
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 5
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.const 2
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.const 2
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i32.const -1
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 2
; CHECK-NEXT: br_if 0 # 0: up to label9
; CHECK-NEXT: .LBB1_7: # %bb41.exit
; CHECK-NEXT: end_loop
; CHECK-NEXT: end_block # label5:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.store 12
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32.store 4
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.store 0
; CHECK-NEXT: # fallthrough-return
;
; MAX-BANDWIDTH-LABEL: bb41_inner_loop:
; MAX-BANDWIDTH: .functype bb41_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32
; MAX-BANDWIDTH-NEXT: # %bb.0: # %entry
; MAX-BANDWIDTH-NEXT: i32.const 0
; MAX-BANDWIDTH-NEXT: local.set 8
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: i32.const 16
; MAX-BANDWIDTH-NEXT: i32.ge_u
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label7
; MAX-BANDWIDTH-NEXT: # %bb.1:
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: br 1 # 1: down to label6
; MAX-BANDWIDTH-NEXT: .LBB1_2: # %vector.ph
; MAX-BANDWIDTH-NEXT: end_block # label7:
; MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 12
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 13
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 14
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 15
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: i32.const -16
; MAX-BANDWIDTH-NEXT: i32.and
; MAX-BANDWIDTH-NEXT: local.tee 8
; MAX-BANDWIDTH-NEXT: i32.const 1
; MAX-BANDWIDTH-NEXT: i32.shl
; MAX-BANDWIDTH-NEXT: local.tee 9
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: .LBB1_3: # %vector.body
; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; MAX-BANDWIDTH-NEXT: loop # label8:
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 16
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 17
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 18
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0
; MAX-BANDWIDTH-NEXT: local.tee 19
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; MAX-BANDWIDTH-NEXT: local.tee 20
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 12
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: local.get 19
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; MAX-BANDWIDTH-NEXT: local.tee 18
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 13
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: local.get 16
; MAX-BANDWIDTH-NEXT: local.get 17
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 14
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s
; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 15
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: i32.const 32
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 1
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: i32.const 32
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 2
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.const -16
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 7
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label8
; MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block
; MAX-BANDWIDTH-NEXT: end_loop
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 6
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 5
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 4
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.eq
; MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label5
; MAX-BANDWIDTH-NEXT: .LBB1_5: # %scalar.ph
; MAX-BANDWIDTH-NEXT: end_block # label6:
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.sub
; MAX-BANDWIDTH-NEXT: local.set 2
; MAX-BANDWIDTH-NEXT: .LBB1_6: # %bb41
; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; MAX-BANDWIDTH-NEXT: loop # label9:
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.const 1
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 1
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.const 1
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 3
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 8
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.load8_s 0
; MAX-BANDWIDTH-NEXT: local.tee 21
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 4
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 6
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.get 21
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 5
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.const 2
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.const 2
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: i32.const -1
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 2
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label9
; MAX-BANDWIDTH-NEXT: .LBB1_7: # %bb41.exit
; MAX-BANDWIDTH-NEXT: end_loop
; MAX-BANDWIDTH-NEXT: end_block # label5:
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.store 12
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.store 8
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32.store 4
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: i32.store 0
; MAX-BANDWIDTH-NEXT: # fallthrough-return
;
; RELAXED-MAX-BANDWIDTH-LABEL: bb41_inner_loop:
; RELAXED-MAX-BANDWIDTH: .functype bb41_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; RELAXED-MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.0: # %entry
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 16
; RELAXED-MAX-BANDWIDTH-NEXT: i32.ge_u
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label7
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.1:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: br 1 # 1: down to label6
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_2: # %vector.ph
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label7:
; RELAXED-MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16
; RELAXED-MAX-BANDWIDTH-NEXT: i32.and
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_3: # %vector.body
; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; RELAXED-MAX-BANDWIDTH-NEXT: loop # label8:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 16
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 32
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 32
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label8
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block
; RELAXED-MAX-BANDWIDTH-NEXT: end_loop
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.eq
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label5
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_5: # %scalar.ph
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label6:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_6: # %bb41
; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; RELAXED-MAX-BANDWIDTH-NEXT: loop # label9:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 2
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label9
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_7: # %bb41.exit
; RELAXED-MAX-BANDWIDTH-NEXT: end_loop
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label5:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 0
; RELAXED-MAX-BANDWIDTH-NEXT: # fallthrough-return
entry:
br label %bb41
bb41:
%idx = phi i32 [ 0, %entry ], [ %idx.next, %bb41 ]
%lhs.ptr = phi ptr [ %lhs, %entry ], [ %lhs.next, %bb41 ]
%rhs.ptr = phi ptr [ %rhs, %entry ], [ %rhs.next, %bb41 ]
%acc00.phi = phi i32 [ %acc00, %entry ], [ %acc00.next, %bb41 ]
%acc01.phi = phi i32 [ %acc01, %entry ], [ %acc01.next, %bb41 ]
%acc10.phi = phi i32 [ %acc10, %entry ], [ %acc10.next, %bb41 ]
%acc11.phi = phi i32 [ %acc11, %entry ], [ %acc11.next, %bb41 ]
%lhs0 = load i8, ptr %lhs.ptr, align 1
%lhs0.sext = sext i8 %lhs0 to i32
%rhs0 = load i8, ptr %rhs.ptr, align 1
%rhs0.sext = sext i8 %rhs0 to i32
%mul00 = mul nsw i32 %rhs0.sext, %lhs0.sext
%acc00.next = add nsw i32 %mul00, %acc00.phi
%rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs.ptr, i32 1
%rhs1 = load i8, ptr %rhs1.ptr, align 1
%rhs1.sext = sext i8 %rhs1 to i32
%mul01 = mul nsw i32 %rhs1.sext, %lhs0.sext
%acc01.next = add nsw i32 %mul01, %acc01.phi
%lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs.ptr, i32 1
%lhs1 = load i8, ptr %lhs1.ptr, align 1
%lhs1.sext = sext i8 %lhs1 to i32
%mul10 = mul nsw i32 %lhs1.sext, %rhs0.sext
%acc10.next = add nsw i32 %mul10, %acc10.phi
%mul11 = mul nsw i32 %lhs1.sext, %rhs1.sext
%acc11.next = add nsw i32 %mul11, %acc11.phi
%lhs.next = getelementptr inbounds nuw i8, ptr %lhs.ptr, i32 2
%rhs.next = getelementptr inbounds nuw i8, ptr %rhs.ptr, i32 2
%idx.next = add nuw nsw i32 %idx, 1
%exit = icmp eq i32 %idx.next, %len
br i1 %exit, label %bb41.exit, label %bb41
bb41.exit:
%res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %acc00.next, 0
%res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %acc01.next, 1
%res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %acc10.next, 2
%res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %acc11.next, 3
ret { i32, i32, i32, i32 } %res3
}
define hidden { i32, i32, i32, i32 } @bb41_inner_loop_i16(ptr nocapture %lhs, ptr nocapture %rhs, i32 %len, i32 %acc00, i32 %acc01, i32 %acc10, i32 %acc11) local_unnamed_addr {
; CHECK-LABEL: bb41_inner_loop_i16:
; CHECK: .functype bb41_inner_loop_i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; CHECK-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: local.set 8
; CHECK-NEXT: block
; CHECK-NEXT: block
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.const 5
; CHECK-NEXT: i32.ge_u
; CHECK-NEXT: br_if 0 # 0: down to label11
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.set 10
; CHECK-NEXT: br 1 # 1: down to label10
; CHECK-NEXT: .LBB2_2: # %vector.ph
; CHECK-NEXT: end_block # label11:
; CHECK-NEXT: v128.const 0, 0, 0, 0
; CHECK-NEXT: local.tee 11
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 12
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 13
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 14
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32x4.replace_lane 0
; CHECK-NEXT: local.set 11
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.const 3
; CHECK-NEXT: i32.and
; CHECK-NEXT: local.tee 9
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.select
; CHECK-NEXT: i32.sub
; CHECK-NEXT: local.tee 8
; CHECK-NEXT: i32.const 3
; CHECK-NEXT: i32.shl
; CHECK-NEXT: local.tee 9
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 8
; CHECK-NEXT: local.set 7
; CHECK-NEXT: .LBB2_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label12:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: v128.load 0:p2align=1
; CHECK-NEXT: local.tee 15
; CHECK-NEXT: local.get 1
; CHECK-NEXT: v128.load 16:p2align=1
; CHECK-NEXT: local.tee 16
; CHECK-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: local.tee 17
; CHECK-NEXT: local.get 2
; CHECK-NEXT: v128.load 0:p2align=1
; CHECK-NEXT: local.tee 18
; CHECK-NEXT: local.get 2
; CHECK-NEXT: v128.load 16:p2align=1
; CHECK-NEXT: local.tee 19
; CHECK-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: local.tee 20
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 12
; CHECK-NEXT: local.get 17
; CHECK-NEXT: local.get 18
; CHECK-NEXT: local.get 19
; CHECK-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: local.tee 18
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 13
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 13
; CHECK-NEXT: local.get 20
; CHECK-NEXT: local.get 15
; CHECK-NEXT: local.get 16
; CHECK-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: local.tee 15
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 14
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 14
; CHECK-NEXT: local.get 18
; CHECK-NEXT: local.get 15
; CHECK-NEXT: i32x4.extmul_low_i16x8_s
; CHECK-NEXT: local.get 11
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.set 11
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 32
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i32.const 32
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 7
; CHECK-NEXT: br_if 0 # 0: up to label12
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: end_loop
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 7
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 13
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 6
; CHECK-NEXT: local.get 14
; CHECK-NEXT: local.get 14
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 5
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 11
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: local.tee 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: local.get 12
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: i32x4.add
; CHECK-NEXT: i32x4.extract_lane 0
; CHECK-NEXT: local.set 4
; CHECK-NEXT: .LBB2_5: # %scalar.ph
; CHECK-NEXT: end_block # label10:
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.sub
; CHECK-NEXT: local.set 2
; CHECK-NEXT: .LBB2_6: # %bb41
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label13:
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load16_s 0
; CHECK-NEXT: local.tee 1
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.load16_s 0
; CHECK-NEXT: local.tee 3
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 7
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.load16_s 0
; CHECK-NEXT: local.tee 8
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.load16_s 0
; CHECK-NEXT: local.tee 21
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 4
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 8
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 6
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 21
; CHECK-NEXT: i32.mul
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 5
; CHECK-NEXT: local.get 9
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 9
; CHECK-NEXT: local.get 10
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.set 10
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i32.const -1
; CHECK-NEXT: i32.add
; CHECK-NEXT: local.tee 2
; CHECK-NEXT: br_if 0 # 0: up to label13
; CHECK-NEXT: # %bb.7: # %bb41.exit
; CHECK-NEXT: end_loop
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.store 12
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 6
; CHECK-NEXT: i32.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 5
; CHECK-NEXT: i32.store 4
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.store 0
; CHECK-NEXT: # fallthrough-return
;
; MAX-BANDWIDTH-LABEL: bb41_inner_loop_i16:
; MAX-BANDWIDTH: .functype bb41_inner_loop_i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32
; MAX-BANDWIDTH-NEXT: # %bb.0: # %entry
; MAX-BANDWIDTH-NEXT: i32.const 0
; MAX-BANDWIDTH-NEXT: local.set 8
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: block
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: i32.const 9
; MAX-BANDWIDTH-NEXT: i32.ge_u
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label11
; MAX-BANDWIDTH-NEXT: # %bb.1:
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: br 1 # 1: down to label10
; MAX-BANDWIDTH-NEXT: .LBB2_2: # %vector.ph
; MAX-BANDWIDTH-NEXT: end_block # label11:
; MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0
; MAX-BANDWIDTH-NEXT: local.tee 11
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 12
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 13
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 14
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; MAX-BANDWIDTH-NEXT: local.set 11
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: i32.const 7
; MAX-BANDWIDTH-NEXT: i32.and
; MAX-BANDWIDTH-NEXT: local.tee 9
; MAX-BANDWIDTH-NEXT: i32.const 8
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.select
; MAX-BANDWIDTH-NEXT: i32.sub
; MAX-BANDWIDTH-NEXT: local.tee 8
; MAX-BANDWIDTH-NEXT: i32.const 3
; MAX-BANDWIDTH-NEXT: i32.shl
; MAX-BANDWIDTH-NEXT: local.tee 9
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: .LBB2_3: # %vector.body
; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; MAX-BANDWIDTH-NEXT: loop # label12:
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 15
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 16
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 17
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 18
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 19
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 20
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 21
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 22
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1
; MAX-BANDWIDTH-NEXT: local.tee 23
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 24
; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 12
; MAX-BANDWIDTH-NEXT: local.get 19
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: local.get 21
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH-NEXT: local.get 22
; MAX-BANDWIDTH-NEXT: local.get 23
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 20
; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 13
; MAX-BANDWIDTH-NEXT: local.get 24
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: local.get 16
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; MAX-BANDWIDTH-NEXT: local.get 17
; MAX-BANDWIDTH-NEXT: local.get 18
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; MAX-BANDWIDTH-NEXT: local.tee 15
; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 14
; MAX-BANDWIDTH-NEXT: local.get 20
; MAX-BANDWIDTH-NEXT: local.get 15
; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.set 11
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: i32.const 64
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 1
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: i32.const 64
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 2
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.const -8
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 7
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label12
; MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block
; MAX-BANDWIDTH-NEXT: end_loop
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 13
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 6
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: local.get 14
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 5
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 11
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: local.tee 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: local.get 12
; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; MAX-BANDWIDTH-NEXT: i32x4.add
; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; MAX-BANDWIDTH-NEXT: local.set 4
; MAX-BANDWIDTH-NEXT: .LBB2_5: # %scalar.ph
; MAX-BANDWIDTH-NEXT: end_block # label10:
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.sub
; MAX-BANDWIDTH-NEXT: local.set 2
; MAX-BANDWIDTH-NEXT: .LBB2_6: # %bb41
; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; MAX-BANDWIDTH-NEXT: loop # label13:
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.const 4
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load16_s 0
; MAX-BANDWIDTH-NEXT: local.tee 1
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.const 4
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: i32.load16_s 0
; MAX-BANDWIDTH-NEXT: local.tee 3
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 7
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.load16_s 0
; MAX-BANDWIDTH-NEXT: local.tee 8
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.load16_s 0
; MAX-BANDWIDTH-NEXT: local.tee 25
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 4
; MAX-BANDWIDTH-NEXT: local.get 1
; MAX-BANDWIDTH-NEXT: local.get 8
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 6
; MAX-BANDWIDTH-NEXT: local.get 3
; MAX-BANDWIDTH-NEXT: local.get 25
; MAX-BANDWIDTH-NEXT: i32.mul
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 5
; MAX-BANDWIDTH-NEXT: local.get 9
; MAX-BANDWIDTH-NEXT: i32.const 8
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 9
; MAX-BANDWIDTH-NEXT: local.get 10
; MAX-BANDWIDTH-NEXT: i32.const 8
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.set 10
; MAX-BANDWIDTH-NEXT: local.get 2
; MAX-BANDWIDTH-NEXT: i32.const -1
; MAX-BANDWIDTH-NEXT: i32.add
; MAX-BANDWIDTH-NEXT: local.tee 2
; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label13
; MAX-BANDWIDTH-NEXT: # %bb.7: # %bb41.exit
; MAX-BANDWIDTH-NEXT: end_loop
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 7
; MAX-BANDWIDTH-NEXT: i32.store 12
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 6
; MAX-BANDWIDTH-NEXT: i32.store 8
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 5
; MAX-BANDWIDTH-NEXT: i32.store 4
; MAX-BANDWIDTH-NEXT: local.get 0
; MAX-BANDWIDTH-NEXT: local.get 4
; MAX-BANDWIDTH-NEXT: i32.store 0
; MAX-BANDWIDTH-NEXT: # fallthrough-return
;
; RELAXED-MAX-BANDWIDTH-LABEL: bb41_inner_loop_i16:
; RELAXED-MAX-BANDWIDTH: .functype bb41_inner_loop_i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; RELAXED-MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.0: # %entry
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: block
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.ge_u
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label11
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.1:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: br 1 # 1: down to label10
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_2: # %vector.ph
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label11:
; RELAXED-MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.and
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.select
; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_3: # %vector.body
; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; RELAXED-MAX-BANDWIDTH-NEXT: loop # label12:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 16
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label12
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block
; RELAXED-MAX-BANDWIDTH-NEXT: end_loop
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12
; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_5: # %scalar.ph
; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label10:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2
; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_6: # %bb41
; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1
; RELAXED-MAX-BANDWIDTH-NEXT: loop # label13:
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 3
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 25
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 25
; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 8
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2
; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -1
; RELAXED-MAX-BANDWIDTH-NEXT: i32.add
; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 2
; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label13
; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.7: # %bb41.exit
; RELAXED-MAX-BANDWIDTH-NEXT: end_loop
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 12
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 8
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 4
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0
; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4
; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 0
; RELAXED-MAX-BANDWIDTH-NEXT: # fallthrough-return
entry:
br label %bb41
bb41:
%idx = phi i32 [ 0, %entry ], [ %idx.next, %bb41 ]
%lhs.ptr = phi ptr [ %lhs, %entry ], [ %lhs.next, %bb41 ]
%rhs.ptr = phi ptr [ %rhs, %entry ], [ %rhs.next, %bb41 ]
%acc00.phi = phi i32 [ %acc00, %entry ], [ %acc00.next, %bb41 ]
%acc01.phi = phi i32 [ %acc01, %entry ], [ %acc01.next, %bb41 ]
%acc10.phi = phi i32 [ %acc10, %entry ], [ %acc10.next, %bb41 ]
%acc11.phi = phi i32 [ %acc11, %entry ], [ %acc11.next, %bb41 ]
%lhs0 = load i16, ptr %lhs.ptr, align 2
%lhs0.sext = sext i16 %lhs0 to i32
%rhs0 = load i16, ptr %rhs.ptr, align 2
%rhs0.sext = sext i16 %rhs0 to i32
%mul00 = mul nsw i32 %rhs0.sext, %lhs0.sext
%acc00.next = add nsw i32 %mul00, %acc00.phi
%rhs1.ptr = getelementptr inbounds nuw i16, ptr %rhs.ptr, i32 2
%rhs1 = load i16, ptr %rhs1.ptr, align 2
%rhs1.sext = sext i16 %rhs1 to i32
%mul01 = mul nsw i32 %rhs1.sext, %lhs0.sext
%acc01.next = add nsw i32 %mul01, %acc01.phi
%lhs1.ptr = getelementptr inbounds nuw i16, ptr %lhs.ptr, i32 2
%lhs1 = load i16, ptr %lhs1.ptr, align 2
%lhs1.sext = sext i16 %lhs1 to i32
%mul10 = mul nsw i32 %lhs1.sext, %rhs0.sext
%acc10.next = add nsw i32 %mul10, %acc10.phi
%mul11 = mul nsw i32 %lhs1.sext, %rhs1.sext
%acc11.next = add nsw i32 %mul11, %acc11.phi
%lhs.next = getelementptr inbounds nuw i16, ptr %lhs.ptr, i32 4
%rhs.next = getelementptr inbounds nuw i16, ptr %rhs.ptr, i32 4
%idx.next = add nuw nsw i32 %idx, 1
%exit = icmp eq i32 %idx.next, %len
br i1 %exit, label %bb41.exit, label %bb41
bb41.exit:
%res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %acc00.next, 0
%res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %acc01.next, 1
%res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %acc10.next, 2
%res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %acc11.next, 3
ret { i32, i32, i32, i32 } %res3
}