// mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir - llvm-project - Git at Google

 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s

 //  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
 //  MATMUL-DAG:  #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
 //  MATMUL-DAG:  #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 24)>
 //  MATMUL-DAG:  #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>

 // Test case: the first matmul input is produced by a fill of %arg0.
 //      MATMUL:  fuse_input
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 builtin.func @fuse_input(%arg0: tensor<24x12xf32>,
                          %arg1: tensor<12x25xf32>,
                          %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   // Index constants; no op in this function references them.
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %zero = arith.constant 0.000000e+00 : f32
   // Producer of the first matmul input.
   %lhs_fill = linalg.fill(%zero, %arg0) : f32, tensor<24x12xf32> -> tensor<24x12xf32>

   // Expect three nested tile loops, with affine.min ops bounding the tile sizes.
   //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
   //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
   //      MATMUL:      %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
   //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =
   //      MATMUL:        %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])

   // The fused fill must operate on a slice of %arg0 tiled along both dimensions.
   //      MATMUL:        %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]])
   //      MATMUL:        %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
   //      MATMUL:        %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
   // MATMUL-SAME:                                          %[[IV1]], %[[IV2]]
   // MATMUL-SAME:                                          %[[UB1]], %[[UB2]]
   //      MATMUL:        %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
   //      MATMUL:        %{{.*}} = linalg.matmul ins(%[[T1]]
   %res = linalg.matmul
            ins(%lhs_fill, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>)
            outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
   return %res : tensor<24x25xf32>
 }

 // -----

 //  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
 //  MATMUL-DAG:  #[[MAP1:.*]] = affine_map<(d0) -> (4, -d0 + 25)>

 // Test case: the matmul output operand is produced by a fill of %arg2.
 //      MATMUL:  fuse_output
 // MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 builtin.func @fuse_output(%arg0: tensor<24x12xf32>,
                           %arg1: tensor<12x25xf32>,
                           %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   //  MATMUL-DAG:  %[[C0:.*]] = arith.constant 0 : index
   //  MATMUL-DAG:  %[[C1:.*]] = arith.constant 1 : index
   // Index constants; no op in this function references them.
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %zero = arith.constant 0.000000e+00 : f32
   // Producer of the matmul output operand.
   %out_fill = linalg.fill(%zero, %arg2) : f32, tensor<24x25xf32> -> tensor<24x25xf32>

   // The outermost tile loop must take the function argument as its iteration argument.
   //      MATMUL:  scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
   //      MATMUL:    scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
   //      MATMUL:      %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
   //      MATMUL:      %[[TS0:.*]] = affine.min #[[MAP1]](%[[IV0]])

   // Both output operand dimensions are tiled.
   //      MATMUL:      %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
   // MATMUL-SAME:                                        %[[IV1]], %[[IV0]]
   // MATMUL-SAME:                                        %[[TS1]], %[[TS0]]
   //      MATMUL:      %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
   //      MATMUL:        scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]

   // An extract/insert slice pair must exist for the output operand.
   //  MATMUL-DAG:          %[[D0:.*]] = tensor.dim %[[ARG5]], %[[C0]]
   //  MATMUL-DAG:          %[[D1:.*]] = tensor.dim %[[ARG5]], %[[C1]]
   //      MATMUL:          %[[T2:.*]] = tensor.extract_slice %[[ARG5]]
   // MATMUL-SAME:                                            0, 0
   // MATMUL-SAME:                                            %[[D0]], %[[D1]]
   //      MATMUL:          %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
   //      MATMUL:          %{{.*}} = tensor.insert_slice %[[T3]] into %[[ARG5]]
   // MATMUL-SAME:                                            0, 0
   // MATMUL-SAME:                                            %[[D0]], %[[D1]]
   %res = linalg.matmul
            ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>)
            outs(%out_fill : tensor<24x25xf32>) -> tensor<24x25xf32>
   return %res : tensor<24x25xf32>
 }

 // -----

 //  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0) -> (4, -d0 + 25)>
 //  MATMUL-DAG:  #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
 //  MATMUL-DAG:  #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 25)>
 //  MATMUL-DAG:  #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>
 // Producer indexing maps: identity over three loops for the input, and
 // (d0, d2) for the output, which leaves out the reduction loop d1.
 #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d2)>

 // Test case: the second matmul input is produced by a generic op with a
 // reduction iterator.
 //      MATMUL:  fuse_reduction
 // MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // MATMUL-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32>
 builtin.func @fuse_reduction(%arg0: tensor<24x12xf32>,
                              %arg1: tensor<12x25xf32>,
                              %arg2: tensor<24x25xf32>,
                              %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
   // Index constants; no op in this function references them.
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   // Producer: adds %arg3 into %arg1 with the middle loop as reduction.
   %reduced = linalg.generic {indexing_maps = [#map0, #map1],
                              iterator_types = ["parallel", "reduction", "parallel"]}
       ins(%arg3 : tensor<12x7x25xf32>)
       outs(%arg1 : tensor<12x25xf32>) {
   ^bb0(%in: f32, %acc: f32):  // no predecessors
     %sum = arith.addf %in, %acc : f32
     linalg.yield %sum : f32
   } -> tensor<12x25xf32>

   //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
   //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
   //      MATMUL:      %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
   //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =
   //      MATMUL:        %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])
   //      MATMUL:        %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
   //      MATMUL:        %[[UB0:.*]] = affine.min #[[MAP2]](%[[TS0]], %[[IV0]])

   // Only the parallel dimensions are tiled; the reduction dimension keeps
   // offset 0 and its full size 7.
   //      MATMUL:        %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
   // MATMUL-SAME:                                          %[[IV2]], 0, %[[IV0]]
   // MATMUL-SAME:                                          %[[UB2]], 7, %[[UB0]]
   //      MATMUL:        %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
   // MATMUL-SAME:                                          %[[IV2]], %[[IV0]]
   // MATMUL-SAME:                                          %[[UB2]], %[[UB0]]
   //      MATMUL:        %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
   //      MATMUL:        %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]]
   %res = linalg.matmul
            ins(%arg0, %reduced : tensor<24x12xf32>, tensor<12x25xf32>)
            outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
   return %res : tensor<24x25xf32>
 }

 // -----

 // Producer indexing maps: the input is read transposed, the output is
 // accessed with the identity map.
 #map0 = affine_map<(d0, d1) -> (d1, d0)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>

 // Test case: the first matmul input is produced by a generic op that reads
 // its input through a transposed indexing map.
 //      MATMUL:  fuse_transposed
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // MATMUL-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32>
 builtin.func @fuse_transposed(%arg0: tensor<24x12xf32>,
                               %arg1: tensor<12x25xf32>,
                               %arg2: tensor<24x25xf32>,
                               %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
   // Index constants; no op in this function references them.
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   // Producer: adds the transposed %arg3 into %arg0.
   %lhs = linalg.generic {indexing_maps = [#map0, #map1],
                          iterator_types = ["parallel", "parallel"]}
       ins(%arg3 : tensor<12x24xf32>)
       outs(%arg0 : tensor<24x12xf32>) {
   ^bb0(%in: f32, %out: f32):  // no predecessors
     %sum = arith.addf %in, %out : f32
     linalg.yield %sum : f32
   } -> tensor<24x12xf32>

   //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
   //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
   //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =

   // Because of the transposed indexing map, the input slice offsets are
   // swapped relative to the output slice offsets.
   //      MATMUL:        %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
   // MATMUL-SAME:                                          %[[IV2]], %[[IV1]]
   //      MATMUL:        %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
   // MATMUL-SAME:                                          %[[IV1]], %[[IV2]]
   //      MATMUL:        %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
   //      MATMUL:        %{{.*}} = linalg.matmul ins(%[[T2]]
   %res = linalg.matmul
            ins(%lhs, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>)
            outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
   return %res : tensor<24x25xf32>
 }

 // -----

 // Test case: both the first matmul input and the matmul output operand are
 // produced by fill ops.
 //      MATMUL:  fuse_input_and_output
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 builtin.func @fuse_input_and_output(%arg0: tensor<24x12xf32>,
                                     %arg1: tensor<12x25xf32>,
                                     %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   // Index constants; no op in this function references them.
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %zero = arith.constant 0.000000e+00 : f32
   // Producers of the first matmul input and of the matmul output operand.
   %lhs_fill = linalg.fill(%zero, %arg0) : f32, tensor<24x12xf32> -> tensor<24x12xf32>
   %out_fill = linalg.fill(%zero, %arg2) : f32, tensor<24x25xf32> -> tensor<24x25xf32>

   // The output fill is expected above the innermost loop, the input fill
   // inside the innermost loop.
   //      MATMUL:  scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
   //      MATMUL:    scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
   //      MATMUL:      %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
   // MATMUL-SAME:                                        %[[IV1]], %[[IV0]]
   //      MATMUL:      %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
   //      MATMUL:        scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]
   //      MATMUL:          %[[T2:.*]] = tensor.extract_slice %[[ARG0]]
   // MATMUL-SAME:                                            %[[IV1]], %[[IV2]]
   //      MATMUL:          %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]])
   //      MATMUL:          %[[T4:.*]] = tensor.extract_slice %[[ARG5]]
   //      MATMUL:          %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]]
   %res = linalg.matmul
            ins(%lhs_fill, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>)
            outs(%out_fill : tensor<24x25xf32>) -> tensor<24x25xf32>
   return %res : tensor<24x25xf32>
 }

 // -----

 //  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
 // Producer indexing map: the output is accessed transposed.
 #map0 = affine_map<(d0, d1) -> (d1, d0)>

 // Test case: the second matmul input is produced by a generic op whose body
 // uses linalg.index.
 //      MATMUL:  fuse_indexed
 // MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32>
 builtin.func @fuse_indexed(%arg0: tensor<24x12xi32>,
                            %arg1: tensor<12x25xi32>,
                            %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
   // Index constants; no op in this function references them.
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   // Producer: yields the sum of the two loop indices, cast to i32.
   %rhs = linalg.generic {indexing_maps = [#map0],
                          iterator_types = ["parallel", "parallel"]}
       outs(%arg1 : tensor<12x25xi32>) {
   ^bb0(%out: i32):  // no predecessors
     %i = linalg.index 0 : index
     %j = linalg.index 1 : index
     %ij = arith.addi %i, %j : index
     %val = arith.index_cast %ij : index to i32
     linalg.yield %val : i32
   } -> tensor<12x25xi32>

   //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
   //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
   //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =

   // The indices must be shifted by the slice offsets, and the offsets are
   // swapped because of the transposed indexing map.
   //      MATMUL:        %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
   // MATMUL-SAME:                                          %[[IV2]], %[[IV0]]
   //      MATMUL:  linalg.generic {{.*}} outs(%[[T1]]
   //      MATMUL:  %[[IDX0:.*]] = linalg.index 0
   //      MATMUL:  %[[IDX0_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX0]], %[[IV0]])
   //      MATMUL:  %[[IDX1:.*]] = linalg.index 1
   //      MATMUL:  %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]])
   //      MATMUL:  %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]]
   %res = linalg.matmul
            ins(%arg0, %rhs : tensor<24x12xi32>, tensor<12x25xi32>)
            outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32>
   return %res : tensor<24x25xi32>
 }

 // -----

 // Consumer indexing maps: identity for the input, and (d0) for the output,
 // which leaves out the reduction loop d1.
 #map0 = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>

 // Test case: a reducing generic op whose input and output are both produced
 // by fill ops.
 //      GENERIC:  fuse_outermost_reduction
 // GENERIC-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
 // GENERIC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32>
 func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
                                %arg1: tensor<10xf32>) -> tensor<10xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %in_fill = linalg.fill(%zero, %arg0) : f32, tensor<10x17xf32> -> tensor<10x17xf32>

   // The output fill cannot be fused because the reduction loop is the
   // outermost loop.
   //      GENERIC:      %[[T0:.*]] = linalg.fill(%{{.*}}, %[[ARG1]])
   %out_fill = linalg.fill(%zero, %arg1) : f32, tensor<10xf32> -> tensor<10xf32>

   //      GENERIC:  scf.for %[[IV0:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[T0]]
   //      GENERIC:    scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]

   // Check that the input fill has been fused.
   //      GENERIC:      %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
   // GENERIC-SAME:                                        %[[IV1]], %[[IV0]]
   //      GENERIC:      %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]])
   //      GENERIC:      %[[T3:.*]] = tensor.extract_slice %[[ARG3]]
   // GENERIC-SAME:                                        %[[IV1]]
   //      GENERIC:  linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]]
   %res = linalg.generic {indexing_maps = [#map0, #map1],
                          iterator_types = ["parallel", "reduction"]}
       ins(%in_fill : tensor<10x17xf32>)
       outs(%out_fill : tensor<10xf32>) {
   ^bb0(%in: f32, %acc: f32):  // no predecessors
     %sum = arith.addf %in, %acc : f32
     linalg.yield %sum : f32
   } -> tensor<10xf32>
   return %res : tensor<10xf32>
 }

 // -----

 //  GENERIC-DAG:  #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
 //  GENERIC-DAG:  #[[MAP1:.*]] = affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>
 //  GENERIC-DAG:  #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0, -d1 - d2 + 17)>
 // Consumer indexing maps: the input is accessed at (d0, d0 + d1), the output
 // with the identity map.
 #map0 = affine_map<(d0, d1) -> (d0, d0 + d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>

 // Test case: the consumer reads its fill-produced input through the
 // non-identity map (d0, d0 + d1).
 //      GENERIC:  fuse_non_rectangular
 // GENERIC-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
 func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
                            %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {

   //  GENERIC-DAG:  %[[C0:.*]] = arith.constant 0 : index
   //  GENERIC-DAG:  %[[C4:.*]] = arith.constant 4 : index
   //  GENERIC-DAG:  %[[C5:.*]] = arith.constant 5 : index
   //  GENERIC-DAG:  %[[C8:.*]] = arith.constant 8 : index
   //  GENERIC-DAG:  %[[C10:.*]] = arith.constant 10 : index
   %zero = arith.constant 0.000000e+00 : f32
   %in_fill = linalg.fill(%zero, %arg0) : f32, tensor<10x17xf32> -> tensor<10x17xf32>

   //      GENERIC:  scf.for %[[IV0:[0-9a-zA-Z]*]] = %[[C0]] to %[[C8]] step %[[C4]]
   //      GENERIC:    scf.for %[[IV1:[0-9a-zA-Z]*]] = %[[C0]] to %[[C10]] step %[[C5]]

   // The producer is computed on a hyper-rectangular bounding box. Along the
   // second dimension, the offset is the sum of the induction variables, and
   // the upper bound is either 8 (tile size) or 17 (sum of max indices (9+7)
   // then + 1) minus the induction variables.
   //  GENERIC-DAG:      %[[SUM:.*]] = affine.apply #[[MAP0]](%[[IV1]], %[[IV0]]
   //  GENERIC-DAG:      %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]], %[[IV0]]
   //  GENERIC-DAG:      %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]], %[[IV0]]
   //      GENERIC:      %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
   // GENERIC-SAME:                                        %[[IV1]], %[[SUM]]
   // GENERIC-SAME:                                                , %[[UB1]]
   //      GENERIC:      %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
   %res = linalg.generic {indexing_maps = [#map0, #map1],
                          iterator_types = ["parallel", "parallel"]}
       ins(%in_fill : tensor<10x17xf32>)
       outs(%arg1 : tensor<10x8xf32>) {
   ^bb0(%in: f32, %out: f32):  // no predecessors
     %sum = arith.addf %in, %out : f32
     linalg.yield %sum : f32
   } -> tensor<10x8xf32>
   return %res : tensor<10x8xf32>
 }
	// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s
	// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s

	// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
	// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
	// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 24)>
	// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>

	// Test case: the first matmul input is produced by a fill of %arg0.
	// MATMUL: fuse_input
	// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
	builtin.func @fuse_input(%arg0: tensor<24x12xf32>,
	                         %arg1: tensor<12x25xf32>,
	                         %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
	  // Index constants; no op in this function references them.
	  %c0 = arith.constant 0 : index
	  %c12 = arith.constant 12 : index
	  %c25 = arith.constant 25 : index
	  %c24 = arith.constant 24 : index
	  %c4 = arith.constant 4 : index
	  %cst = arith.constant 0.000000e+00 : f32
	  // Producer of the first matmul input.
	  %0 = linalg.fill(%cst, %arg0) : f32, tensor<24x12xf32> -> tensor<24x12xf32>

	  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
	  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
	  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
	  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
	  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])

	  // Tile both input operand dimensions.
	  // MATMUL: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]])
	  // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
	  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
	  // MATMUL-SAME: %[[IV1]], %[[IV2]]
	  // MATMUL-SAME: %[[UB1]], %[[UB2]]
	  // MATMUL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
	  // MATMUL: %{{.*}} = linalg.matmul ins(%[[T1]]
	  %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
	  return %1 : tensor<24x25xf32>
	}

	// -----

	// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
	// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (4, -d0 + 25)>

	// Test case: the matmul output operand is produced by a fill of %arg2.
	// MATMUL: fuse_output
	// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
	builtin.func @fuse_output(%arg0: tensor<24x12xf32>,
	                          %arg1: tensor<12x25xf32>,
	                          %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
	  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
	  // MATMUL-DAG: %[[C1:.*]] = arith.constant 1 : index
	  // Index constants; no op in this function references them.
	  %c0 = arith.constant 0 : index
	  %c12 = arith.constant 12 : index
	  %c25 = arith.constant 25 : index
	  %c24 = arith.constant 24 : index
	  %c4 = arith.constant 4 : index
	  %cst = arith.constant 0.000000e+00 : f32
	  // Producer of the matmul output operand.
	  %0 = linalg.fill(%cst, %arg2) : f32, tensor<24x25xf32> -> tensor<24x25xf32>

	  // Update the iteration argument of the outermost tile loop.
	  // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
	  // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
	  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
	  // MATMUL: %[[TS0:.*]] = affine.min #[[MAP1]](%[[IV0]])

	  // Tile both output operand dimensions.
	  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
	  // MATMUL-SAME: %[[IV1]], %[[IV0]]
	  // MATMUL-SAME: %[[TS1]], %[[TS0]]
	  // MATMUL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
	  // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]

	  // Check there is an extract/insert slice pair for the output operand.
	  // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG5]], %[[C0]]
	  // MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG5]], %[[C1]]
	  // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG5]]
	  // MATMUL-SAME: 0, 0
	  // MATMUL-SAME: %[[D0]], %[[D1]]
	  // MATMUL: %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
	  // MATMUL: %{{.*}} = tensor.insert_slice %[[T3]] into %[[ARG5]]
	  // MATMUL-SAME: 0, 0
	  // MATMUL-SAME: %[[D0]], %[[D1]]
	  %1 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%0 : tensor<24x25xf32>) -> tensor<24x25xf32>
	  return %1 : tensor<24x25xf32>
	}

	// -----

	// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (4, -d0 + 25)>
	// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
	// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 25)>
	// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>
	// Producer indexing maps: identity over three loops for the input, and
	// (d0, d2) for the output, which leaves out the reduction loop d1.
	#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>

	// Test case: the second matmul input is produced by a generic op with a
	// reduction iterator.
	// MATMUL: fuse_reduction
	// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
	// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32>
	builtin.func @fuse_reduction(%arg0: tensor<24x12xf32>,
	                             %arg1: tensor<12x25xf32>,
	                             %arg2: tensor<24x25xf32>,
	                             %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
	  // Index constants; no op in this function references them.
	  %c0 = arith.constant 0 : index
	  %c12 = arith.constant 12 : index
	  %c25 = arith.constant 25 : index
	  %c24 = arith.constant 24 : index
	  %c4 = arith.constant 4 : index
	  // Producer: adds %arg3 into %arg1 with the middle loop as reduction.
	  %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg3 : tensor<12x7x25xf32>) outs(%arg1 : tensor<12x25xf32>) {
	  ^bb0(%arg4: f32, %arg5: f32): // no predecessors
	    %2 = arith.addf %arg4, %arg5 : f32
	    linalg.yield %2 : f32
	  } -> tensor<12x25xf32>

	  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
	  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
	  // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
	  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
	  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])
	  // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
	  // MATMUL: %[[UB0:.*]] = affine.min #[[MAP2]](%[[TS0]], %[[IV0]])

	  // Tile only the parallel dimensions but not the reduction dimension.
	  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
	  // MATMUL-SAME: %[[IV2]], 0, %[[IV0]]
	  // MATMUL-SAME: %[[UB2]], 7, %[[UB0]]
	  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
	  // MATMUL-SAME: %[[IV2]], %[[IV0]]
	  // MATMUL-SAME: %[[UB2]], %[[UB0]]
	  // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
	  // MATMUL: %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]]
	  %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
	  return %1 : tensor<24x25xf32>
	}

	// -----

	// Producer indexing maps: the input is read transposed, the output is
	// accessed with the identity map.
	#map0 = affine_map<(d0, d1) -> (d1, d0)>
	#map1 = affine_map<(d0, d1) -> (d0, d1)>

	// Test case: the first matmul input is produced by a generic op that reads
	// its input through a transposed indexing map.
	// MATMUL: fuse_transposed
	// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
	// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32>
	builtin.func @fuse_transposed(%arg0: tensor<24x12xf32>,
	                              %arg1: tensor<12x25xf32>,
	                              %arg2: tensor<24x25xf32>,
	                              %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
	  // Index constants; no op in this function references them.
	  %c0 = arith.constant 0 : index
	  %c12 = arith.constant 12 : index
	  %c25 = arith.constant 25 : index
	  %c24 = arith.constant 24 : index
	  %c4 = arith.constant 4 : index
	  // Producer: adds the transposed %arg3 into %arg0.
	  %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<12x24xf32>) outs(%arg0 : tensor<24x12xf32>) {
	  ^bb0(%arg4: f32, %arg5: f32): // no predecessors
	    %2 = arith.addf %arg4, %arg5 : f32
	    linalg.yield %2 : f32
	  } -> tensor<24x12xf32>

	  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
	  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
	  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =

	  // Swap the input operand slice offsets due to the transposed indexing map.
	  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
	  // MATMUL-SAME: %[[IV2]], %[[IV1]]
	  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
	  // MATMUL-SAME: %[[IV1]], %[[IV2]]
	  // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
	  // MATMUL: %{{.*}} = linalg.matmul ins(%[[T2]]
	  %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
	  return %1 : tensor<24x25xf32>
	}

	// -----

	// Test case: both the first matmul input and the matmul output operand are
	// produced by fill ops.
	// MATMUL: fuse_input_and_output
	// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
	// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
	builtin.func @fuse_input_and_output(%arg0: tensor<24x12xf32>,
	                                    %arg1: tensor<12x25xf32>,
	                                    %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
	  // Index constants; no op in this function references them.
	  %c0 = arith.constant 0 : index
	  %c12 = arith.constant 12 : index
	  %c25 = arith.constant 25 : index
	  %c24 = arith.constant 24 : index
	  %c4 = arith.constant 4 : index
	  %cst = arith.constant 0.000000e+00 : f32
	  // Producers of the first matmul input and of the matmul output operand.
	  %0 = linalg.fill(%cst, %arg0) : f32, tensor<24x12xf32> -> tensor<24x12xf32>
	  %1 = linalg.fill(%cst, %arg2) : f32, tensor<24x25xf32> -> tensor<24x25xf32>

	  // Fuse both producers to the appropriate tile loops.
	  // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
	  // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
	  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
	  // MATMUL-SAME: %[[IV1]], %[[IV0]]
	  // MATMUL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
	  // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]
	  // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG0]]
	  // MATMUL-SAME: %[[IV1]], %[[IV2]]
	  // MATMUL: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]])
	  // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG5]]
	  // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]]
	  %2 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%1 : tensor<24x25xf32>) -> tensor<24x25xf32>
	  return %2 : tensor<24x25xf32>
	}

	// -----

	// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
	// Producer indexing map: the output is accessed transposed.
	#map0 = affine_map<(d0, d1) -> (d1, d0)>

	// Test case: the second matmul input is produced by a generic op whose body
	// uses linalg.index.
	// MATMUL: fuse_indexed
	// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32>
	builtin.func @fuse_indexed(%arg0: tensor<24x12xi32>,
	                           %arg1: tensor<12x25xi32>,
	                           %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
	  // Index constants; no op in this function references them.
	  %c0 = arith.constant 0 : index
	  %c12 = arith.constant 12 : index
	  %c25 = arith.constant 25 : index
	  %c24 = arith.constant 24 : index
	  %c4 = arith.constant 4 : index
	  // Producer: yields the sum of the two loop indices, cast to i32.
	  %rhs = linalg.generic {indexing_maps = [#map0],
	                         iterator_types = ["parallel", "parallel"]}
	      outs(%arg1 : tensor<12x25xi32>) {
	  ^bb0(%out: i32): // no predecessors
	    %i = linalg.index 0 : index
	    %j = linalg.index 1 : index
	    %ij = arith.addi %i, %j : index
	    %val = arith.index_cast %ij : index to i32
	    linalg.yield %val : i32
	  } -> tensor<12x25xi32>

	  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
	  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
	  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =

	  // The indices must be shifted by the slice offsets, and the offsets are
	  // swapped because of the transposed indexing map.
	  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
	  // MATMUL-SAME: %[[IV2]], %[[IV0]]
	  // MATMUL: linalg.generic {{.*}} outs(%[[T1]]
	  // MATMUL: %[[IDX0:.*]] = linalg.index 0
	  // MATMUL: %[[IDX0_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX0]], %[[IV0]])
	  // MATMUL: %[[IDX1:.*]] = linalg.index 1
	  // MATMUL: %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]])
	  // MATMUL: %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]]
	  %res = linalg.matmul
	           ins(%arg0, %rhs : tensor<24x12xi32>, tensor<12x25xi32>)
	           outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32>
	  return %res : tensor<24x25xi32>
	}

	// -----

	// Consumer indexing maps: identity for the input, and (d0) for the output,
	// which leaves out the reduction loop d1.
	#map0 = affine_map<(d0, d1) -> (d0, d1)>
	#map1 = affine_map<(d0, d1) -> (d0)>

	// Test case: a reducing generic op whose input and output are both produced
	// by fill ops.
	// GENERIC: fuse_outermost_reduction
	// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
	// GENERIC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32>
	func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
	                               %arg1: tensor<10xf32>) -> tensor<10xf32> {
	  %cst = arith.constant 0.000000e+00 : f32
	  %0 = linalg.fill(%cst, %arg0) : f32, tensor<10x17xf32> -> tensor<10x17xf32>

	  // Cannot fuse the output fill since the reduction loop is the outermost loop.
	  // GENERIC: %[[T0:.*]] = linalg.fill(%{{.*}}, %[[ARG1]])
	  %1 = linalg.fill(%cst, %arg1) : f32, tensor<10xf32> -> tensor<10xf32>

	  // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[T0]]
	  // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]

	  // Check that the input fill has been fused.
	  // GENERIC: %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
	  // GENERIC-SAME: %[[IV1]], %[[IV0]]
	  // GENERIC: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]])
	  // GENERIC: %[[T3:.*]] = tensor.extract_slice %[[ARG3]]
	  // GENERIC-SAME: %[[IV1]]
	  // GENERIC: linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]]
	  %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<10x17xf32>) outs(%1 : tensor<10xf32>) {
	  ^bb0(%arg2: f32, %arg3: f32): // no predecessors
	    %3 = arith.addf %arg2, %arg3 : f32
	    linalg.yield %3 : f32
	  } -> tensor<10xf32>
	  return %2 : tensor<10xf32>
	}

	// -----

	// GENERIC-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
	// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>
	// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0, -d1 - d2 + 17)>
	// Consumer indexing maps: the input is accessed at (d0, d0 + d1), the output
	// with the identity map.
	#map0 = affine_map<(d0, d1) -> (d0, d0 + d1)>
	#map1 = affine_map<(d0, d1) -> (d0, d1)>

	// Test case: the consumer reads its fill-produced input through the
	// non-identity map (d0, d0 + d1).
	// GENERIC: fuse_non_rectangular
	// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
	func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
	                           %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {

	  // GENERIC-DAG: %[[C0:.*]] = arith.constant 0 : index
	  // GENERIC-DAG: %[[C4:.*]] = arith.constant 4 : index
	  // GENERIC-DAG: %[[C5:.*]] = arith.constant 5 : index
	  // GENERIC-DAG: %[[C8:.*]] = arith.constant 8 : index
	  // GENERIC-DAG: %[[C10:.*]] = arith.constant 10 : index
	  %cst = arith.constant 0.000000e+00 : f32
	  %0 = linalg.fill(%cst, %arg0) : f32, tensor<10x17xf32> -> tensor<10x17xf32>

	  // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = %[[C0]] to %[[C8]] step %[[C4]]
	  // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = %[[C0]] to %[[C10]] step %[[C5]]

	  // Compute producer on a hyper rectangular bounding box. Along the second dimension,
	  // the offset is set to the sum of the induction variables, and the upper bound
	  // to either 8 (tile size) or 17 (sum of max indices (9+7) then + 1) minus the
	  // induction variables.
	  // GENERIC-DAG: %[[SUM:.*]] = affine.apply #[[MAP0]](%[[IV1]], %[[IV0]]
	  // GENERIC-DAG: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]], %[[IV0]]
	  // GENERIC-DAG: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]], %[[IV0]]
	  // GENERIC: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
	  // GENERIC-SAME: %[[IV1]], %[[SUM]]
	  // GENERIC-SAME: , %[[UB1]]
	  // GENERIC: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
	  %1 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x17xf32>) outs(%arg1 : tensor<10x8xf32>) {
	  ^bb0(%arg2: f32, %arg3: f32): // no predecessors
	    %2 = arith.addf %arg2, %arg3 : f32
	    linalg.yield %2 : f32
	  } -> tensor<10x8xf32>
	  return %1 : tensor<10x8xf32>
	}