| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s |
| ; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH |
| ; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH |
| |
| ; The opt invocations in the RUN lines pass -mattr but no -mtriple, so the |
| ; module-level triple below is what selects the WebAssembly target for the |
| ; loop vectorizer; llc is additionally given -mtriple=wasm32 explicitly. |
| target triple = "wasm32" |
| |
| define hidden { i32, i32, i32, i32 } @bb2053_inner_loop(ptr nocapture %base0, ptr nocapture %base1, ptr nocapture %weights, ptr nocapture readonly %indices, i32 %len, i32 %stride, i32 %acc0, i32 %acc1, i32 %acc2, i32 %acc3) local_unnamed_addr { |
| ; CHECK-LABEL: bb2053_inner_loop: |
| ; CHECK: .functype bb2053_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; CHECK-NEXT: .local i32, i32, v128, v128, v128, v128, v128, i32, i32, i32, i32, i32, i32, v128, v128, v128, v128, v128 |
| ; CHECK-NEXT: # %bb.0: # %entry |
| ; CHECK-NEXT: i32.const 0 |
| ; CHECK-NEXT: local.set 11 |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32.const 4 |
| ; CHECK-NEXT: i32.ge_u |
| ; CHECK-NEXT: br_if 0 # 0: down to label2 |
| ; CHECK-NEXT: # %bb.1: |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.set 12 |
| ; CHECK-NEXT: br 1 # 1: down to label1 |
| ; CHECK-NEXT: .LBB0_2: # %vector.ph |
| ; CHECK-NEXT: end_block # label2: |
| ; CHECK-NEXT: v128.const 0, 0, 0, 0 |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 14 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 15 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 16 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 17 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32.const -4 |
| ; CHECK-NEXT: i32.and |
| ; CHECK-NEXT: local.tee 11 |
| ; CHECK-NEXT: i32.const 2 |
| ; CHECK-NEXT: i32.shl |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 12 |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: .LBB0_3: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: loop # label3: |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: v128.load 0:p2align=2 |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: i32x4.extract_lane 3 |
| ; CHECK-NEXT: local.tee 8 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 7 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i32x4.extract_lane 2 |
| ; CHECK-NEXT: local.tee 18 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 19 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i32x4.extract_lane 1 |
| ; CHECK-NEXT: local.tee 20 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 21 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.tee 22 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 23 |
| ; CHECK-NEXT: v128.load8_splat 0 |
| ; CHECK-NEXT: v128.load8_lane 0, 1 |
| ; CHECK-NEXT: v128.load8_lane 0, 2 |
| ; CHECK-NEXT: v128.load8_lane 0, 3 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 24 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: v128.load 0:p2align=0 |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 25 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 19 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 21 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 23 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: v128.load8_splat 0 |
| ; CHECK-NEXT: v128.load8_lane 0, 1 |
| ; CHECK-NEXT: v128.load8_lane 0, 2 |
| ; CHECK-NEXT: v128.load8_lane 0, 3 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 26 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 27 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 14 |
| ; CHECK-NEXT: local.get 24 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 28 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 16 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.get 26 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 16 |
| ; CHECK-NEXT: local.get 25 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 8 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 18 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 7 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 20 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 18 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 22 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 19 |
| ; CHECK-NEXT: v128.load8_splat 0 |
| ; CHECK-NEXT: v128.load8_lane 0, 1 |
| ; CHECK-NEXT: v128.load8_lane 0, 2 |
| ; CHECK-NEXT: v128.load8_lane 0, 3 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 24 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 15 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.get 27 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 18 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 19 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: v128.load8_splat 0 |
| ; CHECK-NEXT: v128.load8_lane 0, 1 |
| ; CHECK-NEXT: v128.load8_lane 0, 2 |
| ; CHECK-NEXT: v128.load8_lane 0, 3 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 25 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 15 |
| ; CHECK-NEXT: local.get 28 |
| ; CHECK-NEXT: local.get 24 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 17 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 25 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 17 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: i32.const 16 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 3 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.const 16 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.const -4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 9 |
| ; CHECK-NEXT: br_if 0 # 0: up to label3 |
| ; CHECK-NEXT: # %bb.4: # %middle.block |
| ; CHECK-NEXT: end_loop |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 15 |
| ; CHECK-NEXT: local.get 15 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 16 |
| ; CHECK-NEXT: local.get 16 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 8 |
| ; CHECK-NEXT: local.get 17 |
| ; CHECK-NEXT: local.get 17 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: i32.eq |
| ; CHECK-NEXT: br_if 1 # 1: down to label0 |
| ; CHECK-NEXT: .LBB0_5: # %scalar.ph |
| ; CHECK-NEXT: end_block # label1: |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: i32.sub |
| ; CHECK-NEXT: local.set 18 |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: i32.const 2 |
| ; CHECK-NEXT: i32.shl |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 3 |
| ; CHECK-NEXT: .LBB0_6: # %bb2053.loop |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: loop # label4: |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: i32.load 0 |
| ; CHECK-NEXT: local.tee 19 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 20 |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 21 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i32.const 1 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 22 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 20 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 20 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i32.const 3 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 23 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 22 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 19 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 19 |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 11 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 23 |
| ; CHECK-NEXT: local.get 19 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 19 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 21 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 22 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 20 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i32.const 2 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 21 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 8 |
| ; CHECK-NEXT: local.get 22 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.get 21 |
| ; CHECK-NEXT: local.get 19 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: i32.const 4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 3 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i32.const 4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 12 |
| ; CHECK-NEXT: local.get 18 |
| ; CHECK-NEXT: i32.const -1 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 18 |
| ; CHECK-NEXT: br_if 0 # 0: up to label4 |
| ; CHECK-NEXT: .LBB0_7: # %bb2053.exit |
| ; CHECK-NEXT: end_loop |
| ; CHECK-NEXT: end_block # label0: |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.store 12 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.store 8 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.store 4 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.store 0 |
| ; CHECK-NEXT: # fallthrough-return |
| ; |
| ; MAX-BANDWIDTH-LABEL: bb2053_inner_loop: |
| ; MAX-BANDWIDTH: .functype bb2053_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; MAX-BANDWIDTH-NEXT: .local i32, i32, v128, v128, v128, v128, v128, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128 |
| ; MAX-BANDWIDTH-NEXT: # %bb.0: # %entry |
| ; MAX-BANDWIDTH-NEXT: i32.const 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 11 |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32.const 16 |
| ; MAX-BANDWIDTH-NEXT: i32.ge_u |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label2 |
| ; MAX-BANDWIDTH-NEXT: # %bb.1: |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.set 12 |
| ; MAX-BANDWIDTH-NEXT: br 1 # 1: down to label1 |
| ; MAX-BANDWIDTH-NEXT: .LBB0_2: # %vector.ph |
| ; MAX-BANDWIDTH-NEXT: end_block # label2: |
| ; MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 16 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 17 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; MAX-BANDWIDTH-NEXT: i32.and |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; MAX-BANDWIDTH-NEXT: i32.shl |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: .LBB0_3: # %vector.body |
| ; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; MAX-BANDWIDTH-NEXT: loop # label3: |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 25 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 26 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 27 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 28 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 29 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 30 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 31 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; MAX-BANDWIDTH-NEXT: local.tee 32 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 33 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 34 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 35 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 36 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 37 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 38 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 39 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; MAX-BANDWIDTH-NEXT: local.tee 40 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 41 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; MAX-BANDWIDTH-NEXT: local.tee 42 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 43 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 44 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 45 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 46 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 47 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 48 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 49 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 50 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 51 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 52 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 52 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 19 |
| ; MAX-BANDWIDTH-NEXT: local.get 21 |
| ; MAX-BANDWIDTH-NEXT: local.get 23 |
| ; MAX-BANDWIDTH-NEXT: local.get 25 |
| ; MAX-BANDWIDTH-NEXT: local.get 27 |
| ; MAX-BANDWIDTH-NEXT: local.get 29 |
| ; MAX-BANDWIDTH-NEXT: local.get 31 |
| ; MAX-BANDWIDTH-NEXT: local.get 33 |
| ; MAX-BANDWIDTH-NEXT: local.get 35 |
| ; MAX-BANDWIDTH-NEXT: local.get 37 |
| ; MAX-BANDWIDTH-NEXT: local.get 39 |
| ; MAX-BANDWIDTH-NEXT: local.get 41 |
| ; MAX-BANDWIDTH-NEXT: local.get 43 |
| ; MAX-BANDWIDTH-NEXT: local.get 45 |
| ; MAX-BANDWIDTH-NEXT: local.get 47 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; MAX-BANDWIDTH-NEXT: local.tee 53 |
| ; MAX-BANDWIDTH-NEXT: local.get 48 |
| ; MAX-BANDWIDTH-NEXT: local.get 49 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 50 |
| ; MAX-BANDWIDTH-NEXT: local.get 51 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 54 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 53 |
| ; MAX-BANDWIDTH-NEXT: local.get 54 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 48 |
| ; MAX-BANDWIDTH-NEXT: local.get 49 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 50 |
| ; MAX-BANDWIDTH-NEXT: local.get 51 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 55 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 55 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 53 |
| ; MAX-BANDWIDTH-NEXT: local.get 48 |
| ; MAX-BANDWIDTH-NEXT: local.get 49 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 50 |
| ; MAX-BANDWIDTH-NEXT: local.get 51 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 53 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 16 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 16 |
| ; MAX-BANDWIDTH-NEXT: local.get 52 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 22 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 24 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 26 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 28 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 30 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 32 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 34 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 25 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 36 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 26 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 38 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 27 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 40 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 28 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 42 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 29 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 44 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 30 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 46 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 31 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; MAX-BANDWIDTH-NEXT: local.tee 48 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 52 |
| ; MAX-BANDWIDTH-NEXT: local.get 48 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 54 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: local.get 19 |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 21 |
| ; MAX-BANDWIDTH-NEXT: local.get 22 |
| ; MAX-BANDWIDTH-NEXT: local.get 23 |
| ; MAX-BANDWIDTH-NEXT: local.get 24 |
| ; MAX-BANDWIDTH-NEXT: local.get 25 |
| ; MAX-BANDWIDTH-NEXT: local.get 26 |
| ; MAX-BANDWIDTH-NEXT: local.get 27 |
| ; MAX-BANDWIDTH-NEXT: local.get 28 |
| ; MAX-BANDWIDTH-NEXT: local.get 29 |
| ; MAX-BANDWIDTH-NEXT: local.get 30 |
| ; MAX-BANDWIDTH-NEXT: local.get 31 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; MAX-BANDWIDTH-NEXT: local.tee 49 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 54 |
| ; MAX-BANDWIDTH-NEXT: local.get 49 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 55 |
| ; MAX-BANDWIDTH-NEXT: local.get 48 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 55 |
| ; MAX-BANDWIDTH-NEXT: local.get 48 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 49 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 49 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 17 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 17 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label3 |
| ; MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block |
| ; MAX-BANDWIDTH-NEXT: end_loop |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 16 |
| ; MAX-BANDWIDTH-NEXT: local.get 16 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 17 |
| ; MAX-BANDWIDTH-NEXT: local.get 17 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i32.eq |
| ; MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label0 |
| ; MAX-BANDWIDTH-NEXT: .LBB0_5: # %scalar.ph |
| ; MAX-BANDWIDTH-NEXT: end_block # label1: |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i32.sub |
| ; MAX-BANDWIDTH-NEXT: local.set 18 |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; MAX-BANDWIDTH-NEXT: i32.shl |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 3 |
| ; MAX-BANDWIDTH-NEXT: .LBB0_6: # %bb2053.loop |
| ; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; MAX-BANDWIDTH-NEXT: loop # label4: |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: i32.load 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i32.const 3 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 22 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 19 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 23 |
| ; MAX-BANDWIDTH-NEXT: local.get 19 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 21 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 22 |
| ; MAX-BANDWIDTH-NEXT: local.get 24 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.get 21 |
| ; MAX-BANDWIDTH-NEXT: local.get 19 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: i32.const -1 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label4 |
| ; MAX-BANDWIDTH-NEXT: .LBB0_7: # %bb2053.exit |
| ; MAX-BANDWIDTH-NEXT: end_loop |
| ; MAX-BANDWIDTH-NEXT: end_block # label0: |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.store 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.store 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.store 4 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.store 0 |
| ; MAX-BANDWIDTH-NEXT: # fallthrough-return |
| ; |
| ; RELAXED-MAX-BANDWIDTH-LABEL: bb2053_inner_loop: |
| ; RELAXED-MAX-BANDWIDTH: .functype bb2053_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .local i32, i32, v128, v128, v128, v128, v128, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.0: # %entry |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.ge_u |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.1: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br 1 # 1: down to label1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_2: # %vector.ph |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label2: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.and |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_3: # %vector.body |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: loop # label3: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 26 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 27 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 28 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 29 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 30 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 32 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 33 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 34 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 35 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 36 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 37 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 38 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 39 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 40 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 41 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 42 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 43 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 44 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 45 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 46 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 47 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 48 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 49 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 50 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 51 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 52 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 27 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 29 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 33 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 35 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 37 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 39 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 41 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 43 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 45 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 47 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 53 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 51 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 54 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 48 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 51 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 55 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 53 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 51 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 52 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 26 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 28 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 30 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 32 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 34 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 36 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 26 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 38 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 27 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 40 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 28 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 42 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 29 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 44 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 30 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 46 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 49 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 54 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 26 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 27 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 28 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 29 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 30 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_splat 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load8_lane 0, 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 50 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 55 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 49 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 50 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_loop |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.eq |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_5: # %scalar.ph |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label1: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_6: # %bb2053.loop |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: loop # label4: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB0_7: # %bb2053.exit |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_loop |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label0: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # fallthrough-return |
| ; Entry block: falls straight into the loop with no guard on %len, so the loop |
| ; body runs before the first exit comparison is made. |
| entry: |
| br label %bb2053.loop |
| |
| ; Loop body. Each iteration reads one i32 from %indices, uses it as a byte |
| ; offset into %base0 and %base1, and folds eight sign-extended i8 x i8 products |
| ; into the four i32 accumulators: |
| ;   accA += w0*lhs0 + w2*lhs1        accB += w0*rhs0 + w2*rhs1 |
| ;   accC += w1*lhs0 + w3*lhs1        accD += w1*rhs0 + w3*rhs1 |
| ; w0..w3 are the four consecutive bytes at %wptr; lhs0/rhs0 are the bytes at the |
| ; loaded offset in %base0/%base1; lhs1/rhs1 are the bytes %stride further on. |
| bb2053.loop: |
| ; Induction variable, the four running sums, and the walking weight pointer. |
| %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb2053.loop ] |
| %accA = phi i32 [ %acc0, %entry ], [ %accA.sum, %bb2053.loop ] |
| %accB = phi i32 [ %acc1, %entry ], [ %accB.sum, %bb2053.loop ] |
| %accC = phi i32 [ %acc2, %entry ], [ %accC.sum, %bb2053.loop ] |
| %accD = phi i32 [ %acc3, %entry ], [ %accD.sum, %bb2053.loop ] |
| %wptr = phi ptr [ %weights, %entry ], [ %wptr.next, %bb2053.loop ] |
| ; Indirect addressing: idx.val = indices[idx] is the byte offset applied to both |
| ; base pointers. |
| %idx.ptr = getelementptr inbounds nuw i32, ptr %indices, i32 %idx |
| %idx.val = load i32, ptr %idx.ptr, align 4 |
| %lhs0.ptr = getelementptr inbounds i8, ptr %base0, i32 %idx.val |
| %rhs0.ptr = getelementptr inbounds i8, ptr %base1, i32 %idx.val |
| ; accA: first term, w0 * lhs0. |
| %lhs0 = load i8, ptr %lhs0.ptr, align 1 |
| %lhs0.sext = sext i8 %lhs0 to i32 |
| %w0 = load i8, ptr %wptr, align 1 |
| %w0.sext = sext i8 %w0 to i32 |
| %mul0 = mul nsw i32 %w0.sext, %lhs0.sext |
| %accA.next = add nsw i32 %mul0, %accA |
| ; accC: first term, w1 * lhs0 (w1 is the byte at wptr + 1). |
| %w1.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 1 |
| %w1 = load i8, ptr %w1.ptr, align 1 |
| %w1.sext = sext i8 %w1 to i32 |
| %mul1 = mul nsw i32 %w1.sext, %lhs0.sext |
| %accC.next = add nsw i32 %mul1, %accC |
| ; accA: second term, w2 * lhs1 (lhs1 is %stride bytes past lhs0; w2 at wptr + 2). |
| %lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs0.ptr, i32 %stride |
| %lhs1 = load i8, ptr %lhs1.ptr, align 1 |
| %lhs1.sext = sext i8 %lhs1 to i32 |
| %w2.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 2 |
| %w2 = load i8, ptr %w2.ptr, align 1 |
| %w2.sext = sext i8 %w2 to i32 |
| %mul2 = mul nsw i32 %w2.sext, %lhs1.sext |
| %accA.sum = add nsw i32 %accA.next, %mul2 |
| ; accC: second term, w3 * lhs1 (w3 at wptr + 3). |
| %w3.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 3 |
| %w3 = load i8, ptr %w3.ptr, align 1 |
| %w3.sext = sext i8 %w3 to i32 |
| %mul3 = mul nsw i32 %w3.sext, %lhs1.sext |
| %accC.sum = add nsw i32 %accC.next, %mul3 |
| ; accB / accD: first terms, reusing the already-extended w0 and w1 against rhs0. |
| %rhs0 = load i8, ptr %rhs0.ptr, align 1 |
| %rhs0.sext = sext i8 %rhs0 to i32 |
| %mul4 = mul nsw i32 %rhs0.sext, %w0.sext |
| %accB.next = add nsw i32 %mul4, %accB |
| %mul5 = mul nsw i32 %rhs0.sext, %w1.sext |
| %accD.next = add nsw i32 %mul5, %accD |
| ; accB / accD: second terms, reusing w2 and w3 against rhs1 (%stride bytes past |
| ; rhs0). |
| %rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs0.ptr, i32 %stride |
| %rhs1 = load i8, ptr %rhs1.ptr, align 1 |
| %rhs1.sext = sext i8 %rhs1 to i32 |
| %mul6 = mul nsw i32 %rhs1.sext, %w2.sext |
| %accB.sum = add nsw i32 %accB.next, %mul6 |
| %mul7 = mul nsw i32 %rhs1.sext, %w3.sext |
| %accD.sum = add nsw i32 %accD.next, %mul7 |
| ; Step: four weight bytes consumed per iteration, index advances by one; leave |
| ; the loop once the incremented index equals %len. |
| %wptr.next = getelementptr inbounds nuw i8, ptr %wptr, i32 4 |
| %idx.next = add nuw nsw i32 %idx, 1 |
| %exit = icmp eq i32 %idx.next, %len |
| br i1 %exit, label %bb2053.exit, label %bb2053.loop |
| |
| ; Exit block: packs the final sums into the returned aggregate in the order |
| ; field 0 = accA, field 1 = accB, field 2 = accC, field 3 = accD, starting from |
| ; a poison aggregate so every field is explicitly written before the return. |
| bb2053.exit: |
| %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %accA.sum, 0 |
| %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %accB.sum, 1 |
| %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %accC.sum, 2 |
| %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %accD.sum, 3 |
| ret { i32, i32, i32, i32 } %res3 |
| } |
| |
| define hidden { i32, i32, i32, i32 } @bb41_inner_loop(ptr nocapture %lhs, ptr nocapture %rhs, i32 %len, i32 %acc00, i32 %acc01, i32 %acc10, i32 %acc11) local_unnamed_addr { |
| ; CHECK-LABEL: bb41_inner_loop: |
| ; CHECK: .functype bb41_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; CHECK-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, i32 |
| ; CHECK-NEXT: # %bb.0: # %entry |
| ; CHECK-NEXT: i32.const 0 |
| ; CHECK-NEXT: local.set 8 |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: i32.const 4 |
| ; CHECK-NEXT: i32.ge_u |
| ; CHECK-NEXT: br_if 0 # 0: down to label7 |
| ; CHECK-NEXT: # %bb.1: |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: br 1 # 1: down to label6 |
| ; CHECK-NEXT: .LBB1_2: # %vector.ph |
| ; CHECK-NEXT: end_block # label7: |
| ; CHECK-NEXT: v128.const 0, 0, 0, 0 |
| ; CHECK-NEXT: local.tee 11 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 12 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 13 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 14 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 11 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: i32.const -4 |
| ; CHECK-NEXT: i32.and |
| ; CHECK-NEXT: local.tee 8 |
| ; CHECK-NEXT: i32.const 1 |
| ; CHECK-NEXT: i32.shl |
| ; CHECK-NEXT: local.tee 9 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: .LBB1_3: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: loop # label8: |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: v128.load64_zero 0:p2align=0 |
| ; CHECK-NEXT: local.tee 15 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 16 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: v128.load64_zero 0:p2align=0 |
| ; CHECK-NEXT: local.tee 17 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 18 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 12 |
| ; CHECK-NEXT: local.get 16 |
| ; CHECK-NEXT: local.get 17 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 17 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 13 |
| ; CHECK-NEXT: local.get 18 |
| ; CHECK-NEXT: local.get 15 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| ; CHECK-NEXT: i16x8.extend_low_i8x16_s |
| ; CHECK-NEXT: local.tee 15 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 14 |
| ; CHECK-NEXT: local.get 17 |
| ; CHECK-NEXT: local.get 15 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 11 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: i32.const 8 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 1 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: i32.const 8 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 2 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.const -4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 7 |
| ; CHECK-NEXT: br_if 0 # 0: up to label8 |
| ; CHECK-NEXT: # %bb.4: # %middle.block |
| ; CHECK-NEXT: end_loop |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 6 |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 5 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 4 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.eq |
| ; CHECK-NEXT: br_if 1 # 1: down to label5 |
| ; CHECK-NEXT: .LBB1_5: # %scalar.ph |
| ; CHECK-NEXT: end_block # label6: |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.sub |
| ; CHECK-NEXT: local.set 2 |
| ; CHECK-NEXT: .LBB1_6: # %bb41 |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: loop # label9: |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.const 1 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 1 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.const 1 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 3 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 8 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.load8_s 0 |
| ; CHECK-NEXT: local.tee 19 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 4 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 6 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.get 19 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 5 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.const 2 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.const 2 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: i32.const -1 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 2 |
| ; CHECK-NEXT: br_if 0 # 0: up to label9 |
| ; CHECK-NEXT: .LBB1_7: # %bb41.exit |
| ; CHECK-NEXT: end_loop |
| ; CHECK-NEXT: end_block # label5: |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.store 12 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.store 8 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32.store 4 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: i32.store 0 |
| ; CHECK-NEXT: # fallthrough-return |
| ; |
| ; MAX-BANDWIDTH-LABEL: bb41_inner_loop: |
| ; MAX-BANDWIDTH: .functype bb41_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32 |
| ; MAX-BANDWIDTH-NEXT: # %bb.0: # %entry |
| ; MAX-BANDWIDTH-NEXT: i32.const 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 8 |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: i32.const 16 |
| ; MAX-BANDWIDTH-NEXT: i32.ge_u |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label7 |
| ; MAX-BANDWIDTH-NEXT: # %bb.1: |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: br 1 # 1: down to label6 |
| ; MAX-BANDWIDTH-NEXT: .LBB1_2: # %vector.ph |
| ; MAX-BANDWIDTH-NEXT: end_block # label7: |
| ; MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; MAX-BANDWIDTH-NEXT: i32.and |
| ; MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; MAX-BANDWIDTH-NEXT: i32.shl |
| ; MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: .LBB1_3: # %vector.body |
| ; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; MAX-BANDWIDTH-NEXT: loop # label8: |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 16 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 17 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: local.get 19 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| ; MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 16 |
| ; MAX-BANDWIDTH-NEXT: local.get 17 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_low_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i16x8.extmul_high_i8x16_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.extadd_pairwise_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: i32.const 32 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: i32.const 32 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label8 |
| ; MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block |
| ; MAX-BANDWIDTH-NEXT: end_loop |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 6 |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 5 |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 4 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.eq |
| ; MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label5 |
| ; MAX-BANDWIDTH-NEXT: .LBB1_5: # %scalar.ph |
| ; MAX-BANDWIDTH-NEXT: end_block # label6: |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.sub |
| ; MAX-BANDWIDTH-NEXT: local.set 2 |
| ; MAX-BANDWIDTH-NEXT: .LBB1_6: # %bb41 |
| ; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; MAX-BANDWIDTH-NEXT: loop # label9: |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 3 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 4 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 6 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 21 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 5 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: i32.const -1 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 2 |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label9 |
| ; MAX-BANDWIDTH-NEXT: .LBB1_7: # %bb41.exit |
| ; MAX-BANDWIDTH-NEXT: end_loop |
| ; MAX-BANDWIDTH-NEXT: end_block # label5: |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.store 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.store 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32.store 4 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: i32.store 0 |
| ; MAX-BANDWIDTH-NEXT: # fallthrough-return |
| ; |
| ; RELAXED-MAX-BANDWIDTH-LABEL: bb41_inner_loop: |
| ; RELAXED-MAX-BANDWIDTH: .functype bb41_inner_loop (i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.0: # %entry |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.ge_u |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.1: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br 1 # 1: down to label6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_2: # %vector.ph |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label7: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.and |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_3: # %vector.body |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: loop # label8: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 32 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 32 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_loop |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.eq |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 1 # 1: down to label5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_5: # %scalar.ph |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label6: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_6: # %bb41 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: loop # label9: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load8_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB1_7: # %bb41.exit |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_loop |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label5: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # fallthrough-return |
| entry: |
| br label %bb41 |
| |
| bb41: |
| %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb41 ] |
| %lhs.ptr = phi ptr [ %lhs, %entry ], [ %lhs.next, %bb41 ] |
| %rhs.ptr = phi ptr [ %rhs, %entry ], [ %rhs.next, %bb41 ] |
| %acc00.phi = phi i32 [ %acc00, %entry ], [ %acc00.next, %bb41 ] |
| %acc01.phi = phi i32 [ %acc01, %entry ], [ %acc01.next, %bb41 ] |
| %acc10.phi = phi i32 [ %acc10, %entry ], [ %acc10.next, %bb41 ] |
| %acc11.phi = phi i32 [ %acc11, %entry ], [ %acc11.next, %bb41 ] |
| %lhs0 = load i8, ptr %lhs.ptr, align 1 |
| %lhs0.sext = sext i8 %lhs0 to i32 |
| %rhs0 = load i8, ptr %rhs.ptr, align 1 |
| %rhs0.sext = sext i8 %rhs0 to i32 |
| %mul00 = mul nsw i32 %rhs0.sext, %lhs0.sext |
| %acc00.next = add nsw i32 %mul00, %acc00.phi |
| %rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs.ptr, i32 1 |
| %rhs1 = load i8, ptr %rhs1.ptr, align 1 |
| %rhs1.sext = sext i8 %rhs1 to i32 |
| %mul01 = mul nsw i32 %rhs1.sext, %lhs0.sext |
| %acc01.next = add nsw i32 %mul01, %acc01.phi |
| %lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs.ptr, i32 1 |
| %lhs1 = load i8, ptr %lhs1.ptr, align 1 |
| %lhs1.sext = sext i8 %lhs1 to i32 |
| %mul10 = mul nsw i32 %lhs1.sext, %rhs0.sext |
| %acc10.next = add nsw i32 %mul10, %acc10.phi |
| %mul11 = mul nsw i32 %lhs1.sext, %rhs1.sext |
| %acc11.next = add nsw i32 %mul11, %acc11.phi |
| %lhs.next = getelementptr inbounds nuw i8, ptr %lhs.ptr, i32 2 |
| %rhs.next = getelementptr inbounds nuw i8, ptr %rhs.ptr, i32 2 |
| %idx.next = add nuw nsw i32 %idx, 1 |
| %exit = icmp eq i32 %idx.next, %len |
| br i1 %exit, label %bb41.exit, label %bb41 |
| |
| bb41.exit: |
| %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %acc00.next, 0 |
| %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %acc01.next, 1 |
| %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %acc10.next, 2 |
| %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %acc11.next, 3 |
| ret { i32, i32, i32, i32 } %res3 |
| } |
| |
| define hidden { i32, i32, i32, i32 } @bb41_inner_loop_i16(ptr nocapture %lhs, ptr nocapture %rhs, i32 %len, i32 %acc00, i32 %acc01, i32 %acc10, i32 %acc11) local_unnamed_addr { |
| ; CHECK-LABEL: bb41_inner_loop_i16: |
| ; CHECK: .functype bb41_inner_loop_i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; CHECK-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32 |
| ; CHECK-NEXT: # %bb.0: # %entry |
| ; CHECK-NEXT: i32.const 0 |
| ; CHECK-NEXT: local.set 8 |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: block |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: i32.const 5 |
| ; CHECK-NEXT: i32.ge_u |
| ; CHECK-NEXT: br_if 0 # 0: down to label11 |
| ; CHECK-NEXT: # %bb.1: |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: br 1 # 1: down to label10 |
| ; CHECK-NEXT: .LBB2_2: # %vector.ph |
| ; CHECK-NEXT: end_block # label11: |
| ; CHECK-NEXT: v128.const 0, 0, 0, 0 |
| ; CHECK-NEXT: local.tee 11 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 12 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 13 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 14 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: i32x4.replace_lane 0 |
| ; CHECK-NEXT: local.set 11 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: i32.const 3 |
| ; CHECK-NEXT: i32.and |
| ; CHECK-NEXT: local.tee 9 |
| ; CHECK-NEXT: i32.const 4 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.select |
| ; CHECK-NEXT: i32.sub |
| ; CHECK-NEXT: local.tee 8 |
| ; CHECK-NEXT: i32.const 3 |
| ; CHECK-NEXT: i32.shl |
| ; CHECK-NEXT: local.tee 9 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: .LBB2_3: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: loop # label12: |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: v128.load 0:p2align=1 |
| ; CHECK-NEXT: local.tee 15 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: v128.load 16:p2align=1 |
| ; CHECK-NEXT: local.tee 16 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; CHECK-NEXT: local.tee 17 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: v128.load 0:p2align=1 |
| ; CHECK-NEXT: local.tee 18 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: v128.load 16:p2align=1 |
| ; CHECK-NEXT: local.tee 19 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; CHECK-NEXT: local.tee 20 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 12 |
| ; CHECK-NEXT: local.get 17 |
| ; CHECK-NEXT: local.get 18 |
| ; CHECK-NEXT: local.get 19 |
| ; CHECK-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; CHECK-NEXT: local.tee 18 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 13 |
| ; CHECK-NEXT: local.get 20 |
| ; CHECK-NEXT: local.get 15 |
| ; CHECK-NEXT: local.get 16 |
| ; CHECK-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; CHECK-NEXT: local.tee 15 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 14 |
| ; CHECK-NEXT: local.get 18 |
| ; CHECK-NEXT: local.get 15 |
| ; CHECK-NEXT: i32x4.extmul_low_i16x8_s |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.set 11 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: i32.const 32 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 1 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: i32.const 32 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 2 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.const -4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 7 |
| ; CHECK-NEXT: br_if 0 # 0: up to label12 |
| ; CHECK-NEXT: # %bb.4: # %middle.block |
| ; CHECK-NEXT: end_loop |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 13 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 6 |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: local.get 14 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 5 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 11 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: local.tee 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: local.get 12 |
| ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; CHECK-NEXT: i32x4.add |
| ; CHECK-NEXT: i32x4.extract_lane 0 |
| ; CHECK-NEXT: local.set 4 |
| ; CHECK-NEXT: .LBB2_5: # %scalar.ph |
| ; CHECK-NEXT: end_block # label10: |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.sub |
| ; CHECK-NEXT: local.set 2 |
| ; CHECK-NEXT: .LBB2_6: # %bb41 |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: loop # label13: |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.const 4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load16_s 0 |
| ; CHECK-NEXT: local.tee 1 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.const 4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: i32.load16_s 0 |
| ; CHECK-NEXT: local.tee 3 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 7 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.load16_s 0 |
| ; CHECK-NEXT: local.tee 8 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.load16_s 0 |
| ; CHECK-NEXT: local.tee 21 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 4 |
| ; CHECK-NEXT: local.get 1 |
| ; CHECK-NEXT: local.get 8 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 6 |
| ; CHECK-NEXT: local.get 3 |
| ; CHECK-NEXT: local.get 21 |
| ; CHECK-NEXT: i32.mul |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 5 |
| ; CHECK-NEXT: local.get 9 |
| ; CHECK-NEXT: i32.const 8 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 9 |
| ; CHECK-NEXT: local.get 10 |
| ; CHECK-NEXT: i32.const 8 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.set 10 |
| ; CHECK-NEXT: local.get 2 |
| ; CHECK-NEXT: i32.const -1 |
| ; CHECK-NEXT: i32.add |
| ; CHECK-NEXT: local.tee 2 |
| ; CHECK-NEXT: br_if 0 # 0: up to label13 |
| ; CHECK-NEXT: # %bb.7: # %bb41.exit |
| ; CHECK-NEXT: end_loop |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 7 |
| ; CHECK-NEXT: i32.store 12 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 6 |
| ; CHECK-NEXT: i32.store 8 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 5 |
| ; CHECK-NEXT: i32.store 4 |
| ; CHECK-NEXT: local.get 0 |
| ; CHECK-NEXT: local.get 4 |
| ; CHECK-NEXT: i32.store 0 |
| ; CHECK-NEXT: # fallthrough-return |
| ; |
| ; MAX-BANDWIDTH-LABEL: bb41_inner_loop_i16: |
| ; MAX-BANDWIDTH: .functype bb41_inner_loop_i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32 |
| ; MAX-BANDWIDTH-NEXT: # %bb.0: # %entry |
| ; MAX-BANDWIDTH-NEXT: i32.const 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 8 |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: block |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: i32.const 9 |
| ; MAX-BANDWIDTH-NEXT: i32.ge_u |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label11 |
| ; MAX-BANDWIDTH-NEXT: # %bb.1: |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: br 1 # 1: down to label10 |
| ; MAX-BANDWIDTH-NEXT: .LBB2_2: # %vector.ph |
| ; MAX-BANDWIDTH-NEXT: end_block # label11: |
| ; MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: i32.const 7 |
| ; MAX-BANDWIDTH-NEXT: i32.and |
| ; MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; MAX-BANDWIDTH-NEXT: i32.const 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.select |
| ; MAX-BANDWIDTH-NEXT: i32.sub |
| ; MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; MAX-BANDWIDTH-NEXT: i32.const 3 |
| ; MAX-BANDWIDTH-NEXT: i32.shl |
| ; MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: .LBB2_3: # %vector.body |
| ; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; MAX-BANDWIDTH-NEXT: loop # label12: |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 16 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 17 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1 |
| ; MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 19 |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 21 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 22 |
| ; MAX-BANDWIDTH-NEXT: local.get 23 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 24 |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: local.get 16 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 17 |
| ; MAX-BANDWIDTH-NEXT: local.get 18 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; MAX-BANDWIDTH-NEXT: local.tee 15 |
| ; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 20 |
| ; MAX-BANDWIDTH-NEXT: local.get 15 |
| ; MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.set 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 2 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.const -8 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label12 |
| ; MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block |
| ; MAX-BANDWIDTH-NEXT: end_loop |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 13 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 6 |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 14 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 5 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 11 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 12 |
| ; MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; MAX-BANDWIDTH-NEXT: i32x4.add |
| ; MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; MAX-BANDWIDTH-NEXT: local.set 4 |
| ; MAX-BANDWIDTH-NEXT: .LBB2_5: # %scalar.ph |
| ; MAX-BANDWIDTH-NEXT: end_block # label10: |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.sub |
| ; MAX-BANDWIDTH-NEXT: local.set 2 |
| ; MAX-BANDWIDTH-NEXT: .LBB2_6: # %bb41 |
| ; MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; MAX-BANDWIDTH-NEXT: loop # label13: |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 3 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 7 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; MAX-BANDWIDTH-NEXT: local.tee 25 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 4 |
| ; MAX-BANDWIDTH-NEXT: local.get 1 |
| ; MAX-BANDWIDTH-NEXT: local.get 8 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 6 |
| ; MAX-BANDWIDTH-NEXT: local.get 3 |
| ; MAX-BANDWIDTH-NEXT: local.get 25 |
| ; MAX-BANDWIDTH-NEXT: i32.mul |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 5 |
| ; MAX-BANDWIDTH-NEXT: local.get 9 |
| ; MAX-BANDWIDTH-NEXT: i32.const 8 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 9 |
| ; MAX-BANDWIDTH-NEXT: local.get 10 |
| ; MAX-BANDWIDTH-NEXT: i32.const 8 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.set 10 |
| ; MAX-BANDWIDTH-NEXT: local.get 2 |
| ; MAX-BANDWIDTH-NEXT: i32.const -1 |
| ; MAX-BANDWIDTH-NEXT: i32.add |
| ; MAX-BANDWIDTH-NEXT: local.tee 2 |
| ; MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label13 |
| ; MAX-BANDWIDTH-NEXT: # %bb.7: # %bb41.exit |
| ; MAX-BANDWIDTH-NEXT: end_loop |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 7 |
| ; MAX-BANDWIDTH-NEXT: i32.store 12 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 6 |
| ; MAX-BANDWIDTH-NEXT: i32.store 8 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 5 |
| ; MAX-BANDWIDTH-NEXT: i32.store 4 |
| ; MAX-BANDWIDTH-NEXT: local.get 0 |
| ; MAX-BANDWIDTH-NEXT: local.get 4 |
| ; MAX-BANDWIDTH-NEXT: i32.store 0 |
| ; MAX-BANDWIDTH-NEXT: # fallthrough-return |
| ; |
| ; RELAXED-MAX-BANDWIDTH-LABEL: bb41_inner_loop_i16: |
| ; RELAXED-MAX-BANDWIDTH: .functype bb41_inner_loop_i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> () |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .local i32, i32, i32, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, v128, i32 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.0: # %entry |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.ge_u |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: down to label11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.1: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br 1 # 1: down to label10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_2: # %vector.ph |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label11: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.const 0, 0, 0, 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.replace_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.and |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.select |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.shl |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_3: # %vector.body |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: loop # label12: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 0:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 16:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 32:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: v128.load 48:p2align=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 12, 13, 20, 21, 28, 29 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 19 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 21 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 22 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 23 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 24 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 16 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 17 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 18 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 16, 17, 24, 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 20 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 15 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.dot_i16x8_s |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 64 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.4: # %middle.block |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_loop |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 14 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 11 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32x4.extract_lane 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_5: # %scalar.ph |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_block # label10: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.sub |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: .LBB2_6: # %bb41 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: loop # label13: |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.load16_s 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 3 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 25 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.mul |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 9 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.set 10 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.const -1 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.add |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.tee 2 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: br_if 0 # 0: up to label13 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # %bb.7: # %bb41.exit |
| ; RELAXED-MAX-BANDWIDTH-NEXT: end_loop |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 7 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 12 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 6 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 8 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 5 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: local.get 4 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: i32.store 0 |
| ; RELAXED-MAX-BANDWIDTH-NEXT: # fallthrough-return |
| entry: |
| br label %bb41 |
| |
| bb41: |
| %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb41 ] |
| %lhs.ptr = phi ptr [ %lhs, %entry ], [ %lhs.next, %bb41 ] |
| %rhs.ptr = phi ptr [ %rhs, %entry ], [ %rhs.next, %bb41 ] |
| %acc00.phi = phi i32 [ %acc00, %entry ], [ %acc00.next, %bb41 ] |
| %acc01.phi = phi i32 [ %acc01, %entry ], [ %acc01.next, %bb41 ] |
| %acc10.phi = phi i32 [ %acc10, %entry ], [ %acc10.next, %bb41 ] |
| %acc11.phi = phi i32 [ %acc11, %entry ], [ %acc11.next, %bb41 ] |
| %lhs0 = load i16, ptr %lhs.ptr, align 2 |
| %lhs0.sext = sext i16 %lhs0 to i32 |
| %rhs0 = load i16, ptr %rhs.ptr, align 2 |
| %rhs0.sext = sext i16 %rhs0 to i32 |
| %mul00 = mul nsw i32 %rhs0.sext, %lhs0.sext |
| %acc00.next = add nsw i32 %mul00, %acc00.phi |
| %rhs1.ptr = getelementptr inbounds nuw i16, ptr %rhs.ptr, i32 2 |
| %rhs1 = load i16, ptr %rhs1.ptr, align 2 |
| %rhs1.sext = sext i16 %rhs1 to i32 |
| %mul01 = mul nsw i32 %rhs1.sext, %lhs0.sext |
| %acc01.next = add nsw i32 %mul01, %acc01.phi |
| %lhs1.ptr = getelementptr inbounds nuw i16, ptr %lhs.ptr, i32 2 |
| %lhs1 = load i16, ptr %lhs1.ptr, align 2 |
| %lhs1.sext = sext i16 %lhs1 to i32 |
| %mul10 = mul nsw i32 %lhs1.sext, %rhs0.sext |
| %acc10.next = add nsw i32 %mul10, %acc10.phi |
| %mul11 = mul nsw i32 %lhs1.sext, %rhs1.sext |
| %acc11.next = add nsw i32 %mul11, %acc11.phi |
| %lhs.next = getelementptr inbounds nuw i16, ptr %lhs.ptr, i32 4 |
| %rhs.next = getelementptr inbounds nuw i16, ptr %rhs.ptr, i32 4 |
| %idx.next = add nuw nsw i32 %idx, 1 |
| %exit = icmp eq i32 %idx.next, %len |
| br i1 %exit, label %bb41.exit, label %bb41 |
| |
| bb41.exit: |
| %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %acc00.next, 0 |
| %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %acc01.next, 1 |
| %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %acc10.next, 2 |
| %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %acc11.next, 3 |
| ret { i32, i32, i32, i32 } %res3 |
| } |