mlir/test/Dialect/Linalg/pad.mlir - llvm-project - Git at Google

 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=FILL
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 pad-inputs-only run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY

 // MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 12)>
 // MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
 #map = affine_map<()[s0] -> (7, -s0 + 12)>

 //      MATMUL:  static_sizes_output_divisible
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 // MATMUL-SAME:    %[[IV0:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME:    %[[IV1:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME:    %[[IV2:[0-9a-zA-Z]*]]: index
 func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>,
                                     %arg1: tensor<12x25xf32>,
                                     %arg2: tensor<24x25xf32>,
                                     %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   //  MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index

   //      MATMUL:   %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]]
   %0 = affine.min #map()[%iv2]

   //      MATMUL:   %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
   //      MATMUL:   %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
   //      MATMUL:   %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
   %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
   %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>

   // Check statically sized matmul inputs with partially divisible sizes are padded.
   //      MATMUL:   %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]]
   //      MATMUL:   %[[T3:.*]] = linalg.pad_tensor %[[T0]] nofold
   // MATMUL-SAME:                  [%[[C0]], %[[C0]]]
   // MATMUL-SAME:                  [%[[C0]], %[[V0]]
   //      MATMUL:   %[[T4:.*]] = linalg.pad_tensor %[[T1]] nofold

   // Check the statically sized matmul output with fully divisible sizes is not padded.
   //      MATMUL:   %[[T5:.*]] = linalg.matmul
   // MATMUL-SAME:                  ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
   // MATMUL-SAME:                  outs(%[[T2]] : tensor<4x5xf32>)
   //      MATMUL:   %[[T6:.*]] = tensor.insert_slice %[[T5]]
   %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
   return %5 : tensor<24x25xf32>
 }

 // -----

 // MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 25)>
 // MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
 #map = affine_map<()[s0] -> (7, -s0 + 25)>

 //      MATMUL:  static_sizes_input_divisible
 // MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 // MATMUL-SAME:    %[[IV0:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME:    %[[IV1:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME:    %[[IV2:[0-9a-zA-Z]*]]: index
 func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>,
                                    %arg1: tensor<12x25xf32>,
                                    %arg2: tensor<24x25xf32>,
                                    %iv0 : index, %iv1 : index, %iv2 : index) ->  tensor<24x25xf32> {
   //  MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index

   %3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>

   //      MATMUL:   %[[TS1:.*]] = affine.min #[[MAP0]]()[%[[IV1]]]
   %4 = affine.min #map()[%iv1]
   %5 = tensor.extract_slice %arg1[%iv2, %iv1] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>

   //      MATMUL:   %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
   %6 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>

   // Check the statically sized matmul output with partially divisible sizes is padded.
   //      MATMUL:   %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]]
   //      MATMUL:   %[[T1:.*]] = linalg.pad_tensor %[[T0]] low
   // MATMUL-SAME:                  [%[[C0]], %[[C0]]]
   // MATMUL-SAME:                  [%[[C0]], %[[V0]]

   //      MATMUL:   %[[T2:.*]] = linalg.matmul
   // MATMUL-SAME:                  outs(%[[T1]] : tensor<4x7xf32>)
   //      MATMUL:   %[[T3:.*]] = tensor.extract_slice %[[T2]]
   //      MATMUL:   %[[T4:.*]] = tensor.insert_slice %[[T3]]
   %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
   %8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>

    //      MATMUL:   return %[[T4]]
   return %8 : tensor<24x25xf32>
 }

 // -----

 // MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0, s1] -> (5, -s0 + s1)>
 // MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0, s1] -> (7, -s0 + s1)>
 // MATMUL-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0, s1] -> (6, -s0 + s1)>
 // MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)>
 // MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)>

 #map0 = affine_map<()[s0, s1] -> (5, -s0 + s1)>
 #map1 = affine_map<()[s0, s1] -> (6, -s0 + s1)>
 #map2 = affine_map<()[s0, s1] -> (7, -s0 + s1)>

 //      MATMUL:  dynamic_sizes
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
 // MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
 // MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
 // MATMUL-SAME:    %[[IV0:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME:    %[[IV1:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME:    %[[IV2:[0-9a-zA-Z]*]]: index
 func @dynamic_sizes(%arg0: tensor<?x?xf32>,
                     %arg1: tensor<?x?xf32>,
                     %arg2: tensor<?x?xf32>,
                     %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
   //  MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
   //  MATMUL-DAG: %[[C1:.*]] = arith.constant 1
   %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index

   //  MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
   //  MATMUL-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
   //  MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
   %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
   %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
   %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>

   //      MATMUL:   %[[TS0:.*]] = affine.min #[[MAP0]]()[%[[IV0]], %[[D0]]]
   //      MATMUL:   %[[TS2:.*]] = affine.min #[[MAP2]]()[%[[IV2]], %[[D2]]]
   //      MATMUL:   %[[TS1:.*]] = affine.min #[[MAP1]]()[%[[IV1]], %[[D1]]]
   %6 = affine.min #map0()[%iv0, %0]
   %7 = affine.min #map1()[%iv2, %1]
   %8 = tensor.extract_slice %arg0[%iv0, %iv2] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
   %9 = affine.min #map2()[%iv1, %2]
   %10 = tensor.extract_slice %arg1[%iv2, %iv1] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
   %11 = tensor.extract_slice %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>

   // Check all matmul operands are padded.
   //      MATMUL:   %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]]
   //      MATMUL:   %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]]
   //      MATMUL:   %[[T3:.*]] = linalg.pad_tensor %{{.*}} nofold
   // MATMUL-SAME:                  [%[[C0]], %[[C0]]]
   // MATMUL-SAME:                  [%[[V0]], %[[V1]]
   //      MATMUL:   %[[T4:.*]] = linalg.pad_tensor %{{.*}} nofold
   //      MATMUL:   %[[T5:.*]] = linalg.pad_tensor %{{.*}} low

   // Check the dynamic matmul has been erased.
   //  MATMUL-NOT:   = linalg.matmul {{.*}} tensor<?x?xf32>

   // Check all padded matmul operands are statically sized.
   //      MATMUL:   %[[T6:.*]] = linalg.matmul
   // MATMUL-SAME:                  ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
   // MATMUL-SAME:                  outs(%[[T5]] : tensor<5x7xf32>)
   //      MATMUL:   %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
   //      MATMUL:   %[[T8:.*]] = tensor.insert_slice %[[T7]]
   %12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>

   //      MATMUL:   return %[[T8]]
   return %13 : tensor<?x?xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (64, s0)>

 //      FILL:  pad_multiple
 // FILL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
 func @pad_multiple(%arg0: tensor<64x64xf32>,
                       %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

   // Check both fill operations are padded by the same pad tensor operation.
   //      FILL:  %[[T0:.*]] = linalg.pad_tensor
   //      FILL:  %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
   //      FILL:  %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]])
   //      FILL:  = tensor.extract_slice %[[T2]]
   %1 = linalg.fill(%cst, %0) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
   %2 = linalg.fill(%cst, %1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
   return %2 : tensor<?x?xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (64, s0)>

 //      MATMUL:  compose_padding
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
 func @compose_padding(%arg0: tensor<64x64xf32>,
                       %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32

   //      MATMUL:  %[[SIZE:.*]] = affine.min
   %size = affine.min #map0()[%iv0]

   //      MATMUL:  %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
   // MATMUL-SAME:                                     [0, 0]
   // MATMUL-SAME:                                     [%[[SIZE]], %[[SIZE]]]
   //      MATMUL:  %[[T1:.*]] = linalg.pad_tensor %[[T0]]
   //      MATMUL:  %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]
   //      MATMUL:  %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0]  {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
       linalg.yield %cst : f32
   } : tensor<?x?xf32> to tensor<64x64xf32>
   %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
   %3 = linalg.fill(%cst, %2) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
   %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

   // Check there are no additional pad tensor operations.
   //  MATMUL-NOT:  linalg.pad_tensor

   // Check the matmul directly uses the result of the fill operation.
   //      MATMUL:  %[[T4:.*]] = linalg.matmul ins(%[[T3]]
   //      MATMUL:  %[[T5:.*]] = tensor.extract_slice %[[T4]]
   // MATMUL-SAME:                                     [0, 0]
   // MATMUL-SAME:                                     [%[[SIZE]], %[[SIZE]]]
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>

   //      MATMUL:  return %[[T5]]
   return %5 : tensor<?x?xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (64, s0)>

 //      MATMUL:  different_padding_values
 func @different_padding_values(%arg0: tensor<64x64xf32>,
                                %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 42.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0]  {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
       linalg.yield %cst : f32
   } : tensor<?x?xf32> to tensor<64x64xf32>
   %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

   // Different padding values prevent composing the paddings (42.0 vs. 0.0).
   //      MATMUL:  = linalg.fill
   //      MATMUL:  = linalg.pad_tensor
   //      MATMUL:  = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (64, s0)>

 //      MATMUL:  different_padding_dynamic_sizes
 func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
                                       %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0]  {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
       linalg.yield %cst : f32
   } : tensor<?x?xf32> to tensor<64x64xf32>
   %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

   // Different dynamic sizes prevent composing the paddings (%iv0 vs %size).
   //      MATMUL:  = linalg.fill
   //      MATMUL:  = linalg.pad_tensor
   //      MATMUL:  = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (64, s0)>

 //      MATMUL:  different_padding_static_sizes
 func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
                                      %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0]  {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
       linalg.yield %cst : f32
   } : tensor<?x?xf32> to tensor<62x62xf32>
   %2 = linalg.fill(%cst, %1) : f32, tensor<62x62xf32> -> tensor<62x62xf32>
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>

   // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0).
   //      MATMUL:  = linalg.fill
   //      MATMUL:  = linalg.pad_tensor
   //      MATMUL:  = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (7, s0)>

 //      FILL:  scalar_operand
 // FILL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: f32
 // FILL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 func @scalar_operand(%arg0: f32,
                      %arg1: tensor<24x12xf32>,
                      %iv0 : index) -> tensor<24x12xf32> {
   %0 = affine.min #map0()[%iv0]

   //      FILL:   %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
   //      FILL:   %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
   %1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>

   // Check only the fill output operand is padded.
   //      FILL:   %[[T6:.*]] = linalg.fill(%[[ARG0]], %[[T1]]
   %2 = linalg.fill(%arg0, %1) : f32, tensor<4x?xf32> -> tensor<4x?xf32>
   %3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
   return %3 : tensor<24x12xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (7, s0)>

 //      MATMUL:  static_extract_slice_missing
 // MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>,
 func @static_extract_slice_missing(%arg0: tensor<24x12xf32>,
                                    %arg1: tensor<12x25xf32>,
                                    %arg2: tensor<4x5xf32>,
                                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<4x5xf32> {
   %0 = affine.min #map0()[%iv2]
   %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>

   // Check the matmul inputs are padded despite the missing slice for the static output.
   //      MATMUL:  %[[T0:.*]] = linalg.pad_tensor
   //      MATMUL:  %[[T1:.*]] = linalg.pad_tensor
   //      MATMUL:  = linalg.matmul ins(%[[T0]], %[[T1]]
   // MATMUL-SAME:                 outs(%[[ARG2]]
   %3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32>
   return %3 : tensor<4x5xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (7, s0)>

 //      MATMUL:  dynamic_extract_slice_missing
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>,
 // MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
 // MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
 func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>,
                                     %arg1: tensor<12x25xf32>,
                                     %arg2: tensor<24x25xf32>,
                                     %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   %0 = affine.min #map0()[%iv2]

   //      MATMUL:  %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
   //      MATMUL:  %[[T1:.*]] = tensor.extract_slice %[[ARG2]]
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
   %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>

   // Check the matmul is not padded due to the missing slice for the dynamic input.
   //      MATMUL:  = linalg.matmul ins(%[[ARG0]], %[[T0]]
   // MATMUL-SAME:                 outs(%[[T1]]
   %4 = linalg.matmul ins(%arg0, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
   return %5 : tensor<24x25xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (7, s0)>

 //      INPUTS-ONLY:  static_input_padding_only
 // INPUTS-ONLY-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
 func @static_input_padding_only(%arg0: tensor<24x12xf32>,
                                 %arg1: tensor<12x25xf32>,
                                 %arg2: tensor<24x25xf32>,
                                 %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   %0 = affine.min #map0()[%iv2]
   %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>

   // INPUTS-ONLY:  %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
   %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>

   // Check the matmul inputs are padded despite the failure to compute a padding value for the static output.
   // INPUTS-ONLY:  %[[T1:.*]] = linalg.pad_tensor
   // INPUTS-ONLY:  %[[T2:.*]] = linalg.pad_tensor
   // INPUTS-ONLY:  = linalg.matmul ins(%[[T1]], %[[T2]]
   // INPUTS-ONLY-SAME:             outs(%[[T0]]
   %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
   return %5 : tensor<24x25xf32>
 }

 // -----

 #map0 = affine_map<()[s0] -> (7, s0)>

 //      INPUTS-ONLY:  dynamic_input_padding_only
 // INPUTS-ONLY-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>,
 // INPUTS-ONLY-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
 // INPUTS-ONLY-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
 func @dynamic_input_padding_only(%arg0: tensor<24x12xf32>,
                                  %arg1: tensor<12x25xf32>,
                                  %arg2: tensor<24x25xf32>,
                                  %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   %0 = affine.min #map0()[%iv2]

   // INPUTS-ONLY:  %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
   // INPUTS-ONLY:  %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
   // INPUTS-ONLY:  %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
   %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor<?x?xf32>
   %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>

   // Check the matmul is not padded due to the failure to compute a padding value for the dynamic output.
   // INPUTS-ONLY:  = linalg.matmul ins(%[[T0]], %[[T1]]
   // INPUTS-ONLY-SAME:             outs(%[[T2]]
   %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<4x?xf32>) -> tensor<4x?xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
   return %5 : tensor<24x25xf32>
 }
	// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file \| FileCheck %s --check-prefix=MATMUL
	// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file \| FileCheck %s --check-prefix=FILL
	// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 pad-inputs-only run-enable-pass=false" -cse -canonicalize -split-input-file \| FileCheck %s --check-prefix=INPUTS-ONLY

	// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 12)>
	// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
	#map = affine_map<()[s0] -> (7, -s0 + 12)>

	// MATMUL: static_sizes_output_divisible
	// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
	// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
	// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
	// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
	// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
	// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
	func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>,
	%arg1: tensor<12x25xf32>,
	%arg2: tensor<24x25xf32>,
	%iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
	// MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index

	// MATMUL: %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]]
	%0 = affine.min #map()[%iv2]

	// MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
	// MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
	// MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
	%1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
	%2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
	%3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>

	// Check statically sized matmul inputs with partially divisible sizes are padded.
	// MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]]
	// MATMUL: %[[T3:.*]] = linalg.pad_tensor %[[T0]] nofold
	// MATMUL-SAME: [%[[C0]], %[[C0]]]
	// MATMUL-SAME: [%[[C0]], %[[V0]]
	// MATMUL: %[[T4:.*]] = linalg.pad_tensor %[[T1]] nofold

	// Check the statically sized matmul output with fully divisible sizes is not padded.
	// MATMUL: %[[T5:.*]] = linalg.matmul
	// MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
	// MATMUL-SAME: outs(%[[T2]] : tensor<4x5xf32>)
	// MATMUL: %[[T6:.*]] = tensor.insert_slice %[[T5]]
	%4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
	%5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
	return %5 : tensor<24x25xf32>
	}

	// -----

	// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 25)>
	// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
	#map = affine_map<()[s0] -> (7, -s0 + 25)>

	// MATMUL: static_sizes_input_divisible
	// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
	// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
	// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
	// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
	func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>,
	%arg1: tensor<12x25xf32>,
	%arg2: tensor<24x25xf32>,
	%iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
	// MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index

	%3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>

	// MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]]()[%[[IV1]]]
	%4 = affine.min #map()[%iv1]
	%5 = tensor.extract_slice %arg1[%iv2, %iv1] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>

	// MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
	%6 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>

	// Check the statically sized matmul output with partially divisible sizes is padded.
	// MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]]
	// MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] low
	// MATMUL-SAME: [%[[C0]], %[[C0]]]
	// MATMUL-SAME: [%[[C0]], %[[V0]]

	// MATMUL: %[[T2:.*]] = linalg.matmul
	// MATMUL-SAME: outs(%[[T1]] : tensor<4x7xf32>)
	// MATMUL: %[[T3:.*]] = tensor.extract_slice %[[T2]]
	// MATMUL: %[[T4:.*]] = tensor.insert_slice %[[T3]]
	%7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
	%8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>

	// MATMUL: return %[[T4]]
	return %8 : tensor<24x25xf32>
	}

	// -----

	// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0, s1] -> (5, -s0 + s1)>
	// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0, s1] -> (7, -s0 + s1)>
	// MATMUL-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0, s1] -> (6, -s0 + s1)>
	// MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)>
	// MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)>

	#map0 = affine_map<()[s0, s1] -> (5, -s0 + s1)>
	#map1 = affine_map<()[s0, s1] -> (6, -s0 + s1)>
	#map2 = affine_map<()[s0, s1] -> (7, -s0 + s1)>

	// MATMUL: dynamic_sizes
	// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
	// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
	// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
	// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
	// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
	// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
	func @dynamic_sizes(%arg0: tensor<?x?xf32>,
	%arg1: tensor<?x?xf32>,
	%arg2: tensor<?x?xf32>,
	%iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
	// MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
	// MATMUL-DAG: %[[C1:.*]] = arith.constant 1
	%c1 = arith.constant 1 : index
	%c0 = arith.constant 0 : index

	// MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
	// MATMUL-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
	// MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
	%0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
	%1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
	%2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>

	// MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]]()[%[[IV0]], %[[D0]]]
	// MATMUL: %[[TS2:.*]] = affine.min #[[MAP2]]()[%[[IV2]], %[[D2]]]
	// MATMUL: %[[TS1:.*]] = affine.min #[[MAP1]]()[%[[IV1]], %[[D1]]]
	%6 = affine.min #map0()[%iv0, %0]
	%7 = affine.min #map1()[%iv2, %1]
	%8 = tensor.extract_slice %arg0[%iv0, %iv2] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
	%9 = affine.min #map2()[%iv1, %2]
	%10 = tensor.extract_slice %arg1[%iv2, %iv1] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
	%11 = tensor.extract_slice %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>

	// Check all matmul operands are padded.
	// MATMUL: %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]]
	// MATMUL: %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]]
	// MATMUL: %[[T3:.]] = linalg.pad_tensor %{{.}} nofold
	// MATMUL-SAME: [%[[C0]], %[[C0]]]
	// MATMUL-SAME: [%[[V0]], %[[V1]]
	// MATMUL: %[[T4:.]] = linalg.pad_tensor %{{.}} nofold
	// MATMUL: %[[T5:.]] = linalg.pad_tensor %{{.}} low

	// Check the dynamic matmul has been erased.
	// MATMUL-NOT: = linalg.matmul {{.*}} tensor<?x?xf32>

	// Check all padded matmul operands are statically sized.
	// MATMUL: %[[T6:.*]] = linalg.matmul
	// MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
	// MATMUL-SAME: outs(%[[T5]] : tensor<5x7xf32>)
	// MATMUL: %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
	// MATMUL: %[[T8:.*]] = tensor.insert_slice %[[T7]]
	%12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
	%13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>

	// MATMUL: return %[[T8]]
	return %13 : tensor<?x?xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (64, s0)>

	// FILL: pad_multiple
	// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
	func @pad_multiple(%arg0: tensor<64x64xf32>,
	%iv0 : index) -> tensor<?x?xf32> {
	%cst = arith.constant 0.0 : f32
	%size = affine.min #map0()[%iv0]
	%0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

	// Check both fill operations are padded by the same pad tensor operation.
	// FILL: %[[T0:.*]] = linalg.pad_tensor
	// FILL: %[[T1:.]] = linalg.fill(%{{.}}, %[[T0]])
	// FILL: %[[T2:.]] = linalg.fill(%{{.}}, %[[T1]])
	// FILL: = tensor.extract_slice %[[T2]]
	%1 = linalg.fill(%cst, %0) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
	%2 = linalg.fill(%cst, %1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
	return %2 : tensor<?x?xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (64, s0)>

	// MATMUL: compose_padding
	// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
	func @compose_padding(%arg0: tensor<64x64xf32>,
	%iv0 : index) -> tensor<?x?xf32> {
	%cst = arith.constant 0.0 : f32

	// MATMUL: %[[SIZE:.*]] = affine.min
	%size = affine.min #map0()[%iv0]

	// MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
	// MATMUL-SAME: [0, 0]
	// MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
	// MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]]
	// MATMUL: %[[T2:.]] = linalg.fill(%{{.}}, %[[T1]]
	// MATMUL: %[[T3:.]] = linalg.fill(%{{.}}, %[[T2]]
	%0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
	%1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
	^bb0(%arg3: index, %arg4: index): // no predecessors
	linalg.yield %cst : f32
	} : tensor<?x?xf32> to tensor<64x64xf32>
	%2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
	%3 = linalg.fill(%cst, %2) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
	%4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

	// Check there are no additional pad tensor operations.
	// MATMUL-NOT: linalg.pad_tensor

	// Check the matmul directly uses the result of the fill operation.
	// MATMUL: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
	// MATMUL: %[[T5:.*]] = tensor.extract_slice %[[T4]]
	// MATMUL-SAME: [0, 0]
	// MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
	%5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>

	// MATMUL: return %[[T5]]
	return %5 : tensor<?x?xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (64, s0)>

	// MATMUL: different_padding_values
	func @different_padding_values(%arg0: tensor<64x64xf32>,
	%iv0 : index) -> tensor<?x?xf32> {
	%cst = arith.constant 42.0 : f32
	%size = affine.min #map0()[%iv0]
	%0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
	%1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
	^bb0(%arg3: index, %arg4: index): // no predecessors
	linalg.yield %cst : f32
	} : tensor<?x?xf32> to tensor<64x64xf32>
	%2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
	%4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

	// Different padding values prevent composing the paddings (42.0 vs. 0.0).
	// MATMUL: = linalg.fill
	// MATMUL: = linalg.pad_tensor
	// MATMUL: = linalg.matmul
	%5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
	return %5 : tensor<?x?xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (64, s0)>

	// MATMUL: different_padding_dynamic_sizes
	func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
	%iv0 : index) -> tensor<?x?xf32> {
	%cst = arith.constant 0.0 : f32
	%size = affine.min #map0()[%iv0]
	%0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
	%1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
	^bb0(%arg3: index, %arg4: index): // no predecessors
	linalg.yield %cst : f32
	} : tensor<?x?xf32> to tensor<64x64xf32>
	%2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
	%4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

	// Different dynamic sizes prevent composing the paddings (%iv0 vs %size).
	// MATMUL: = linalg.fill
	// MATMUL: = linalg.pad_tensor
	// MATMUL: = linalg.matmul
	%5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
	return %5 : tensor<?x?xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (64, s0)>

	// MATMUL: different_padding_static_sizes
	func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
	%iv0 : index) -> tensor<?x?xf32> {
	%cst = arith.constant 0.0 : f32
	%size = affine.min #map0()[%iv0]
	%0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
	%1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
	^bb0(%arg3: index, %arg4: index): // no predecessors
	linalg.yield %cst : f32
	} : tensor<?x?xf32> to tensor<62x62xf32>
	%2 = linalg.fill(%cst, %1) : f32, tensor<62x62xf32> -> tensor<62x62xf32>
	%4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>

	// Different static sizes prevent composing the paddings (62 vs 64 derived from #map0).
	// MATMUL: = linalg.fill
	// MATMUL: = linalg.pad_tensor
	// MATMUL: = linalg.matmul
	%5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
	return %5 : tensor<?x?xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (7, s0)>

	// FILL: scalar_operand
	// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32
	// FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
	func @scalar_operand(%arg0: f32,
	%arg1: tensor<24x12xf32>,
	%iv0 : index) -> tensor<24x12xf32> {
	%0 = affine.min #map0()[%iv0]

	// FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
	// FILL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
	%1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>

	// Check only the fill output operand is padded.
	// FILL: %[[T6:.*]] = linalg.fill(%[[ARG0]], %[[T1]]
	%2 = linalg.fill(%arg0, %1) : f32, tensor<4x?xf32> -> tensor<4x?xf32>
	%3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
	return %3 : tensor<24x12xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (7, s0)>

	// MATMUL: static_extract_slice_missing
	// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>,
	func @static_extract_slice_missing(%arg0: tensor<24x12xf32>,
	%arg1: tensor<12x25xf32>,
	%arg2: tensor<4x5xf32>,
	%iv0 : index, %iv1 : index, %iv2 : index) -> tensor<4x5xf32> {
	%0 = affine.min #map0()[%iv2]
	%1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
	%2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>

	// Check the matmul inputs are padded despite the missing slice for the static output.
	// MATMUL: %[[T0:.*]] = linalg.pad_tensor
	// MATMUL: %[[T1:.*]] = linalg.pad_tensor
	// MATMUL: = linalg.matmul ins(%[[T0]], %[[T1]]
	// MATMUL-SAME: outs(%[[ARG2]]
	%3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32>
	return %3 : tensor<4x5xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (7, s0)>

	// MATMUL: dynamic_extract_slice_missing
	// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>,
	// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
	// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
	func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>,
	%arg1: tensor<12x25xf32>,
	%arg2: tensor<24x25xf32>,
	%iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
	%0 = affine.min #map0()[%iv2]

	// MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
	// MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG2]]
	%2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
	%3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>

	// Check the matmul is not padded due to the missing slice for the dynamic input.
	// MATMUL: = linalg.matmul ins(%[[ARG0]], %[[T0]]
	// MATMUL-SAME: outs(%[[T1]]
	%4 = linalg.matmul ins(%arg0, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
	%5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
	return %5 : tensor<24x25xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (7, s0)>

	// INPUTS-ONLY: static_input_padding_only
	// INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
	func @static_input_padding_only(%arg0: tensor<24x12xf32>,
	%arg1: tensor<12x25xf32>,
	%arg2: tensor<24x25xf32>,
	%iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
	%0 = affine.min #map0()[%iv2]
	%1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
	%2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>

	// INPUTS-ONLY: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
	%3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>

	// Check the matmul inputs are padded despite the failure to compute a padding value for the static output.
	// INPUTS-ONLY: %[[T1:.*]] = linalg.pad_tensor
	// INPUTS-ONLY: %[[T2:.*]] = linalg.pad_tensor
	// INPUTS-ONLY: = linalg.matmul ins(%[[T1]], %[[T2]]
	// INPUTS-ONLY-SAME: outs(%[[T0]]
	%4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
	%5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
	return %5 : tensor<24x25xf32>
	}

	// -----

	#map0 = affine_map<()[s0] -> (7, s0)>

	// INPUTS-ONLY: dynamic_input_padding_only
	// INPUTS-ONLY-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>,
	// INPUTS-ONLY-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
	// INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
	func @dynamic_input_padding_only(%arg0: tensor<24x12xf32>,
	%arg1: tensor<12x25xf32>,
	%arg2: tensor<24x25xf32>,
	%iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
	%0 = affine.min #map0()[%iv2]

	// INPUTS-ONLY: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
	// INPUTS-ONLY: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
	// INPUTS-ONLY: %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
	%1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
	%2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor<?x?xf32>
	%3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>

	// Check the matmul is not padded due to the failure to compute a padding value for the dynamic output.
	// INPUTS-ONLY: = linalg.matmul ins(%[[T0]], %[[T1]]
	// INPUTS-ONLY-SAME: outs(%[[T2]]
	%4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<4x?xf32>) -> tensor<4x?xf32>
	%5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
	return %5 : tensor<24x25xf32>
	}