[mlir][linalg] Use top down traversal for padding.

Pad the operations using a top down traversal. The top down traversal unlocks folding opportunities and dim op canonicalizations due to the extract slice operation introduced after every padded operation.
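
For example, after the pass the pad_multiple test added below is expected to look roughly as follows (a sketch only; the affine.apply computing the padding amount and the exact pad_tensor attributes are illustrative, not the literal pass output):

  #map = affine_map<()[s0] -> (64, s0)>
  func @pad_multiple(%arg0: tensor<64x64xf32>, %iv0: index) -> tensor<?x?xf32> {
    %cst = arith.constant 0.0 : f32
    %size = affine.min #map()[%iv0]
    %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1]
        : tensor<64x64xf32> to tensor<?x?xf32>
    // Top down: the first fill is padded to the static 64x64 size.
    %high = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%size]
    %1 = linalg.pad_tensor %0 low[0, 0] high[%high, %high] {
    ^bb0(%i: index, %j: index):
      linalg.yield %cst : f32
    } : tensor<?x?xf32> to tensor<64x64xf32>
    %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
    // The second fill, visited afterwards, reuses the already padded tensor
    // instead of introducing another pad_tensor operation.
    %3 = linalg.fill(%cst, %2) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
    %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1]
        : tensor<64x64xf32> to tensor<?x?xf32>
    return %4 : tensor<?x?xf32>
  }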

Depends On D114585

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D114689
diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
index 8beeba7..c1b887e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
@@ -65,10 +65,10 @@
           funcOp.getContext(), options, filter);
     }
     // Search the root operation using bottom up traversal.
-    GreedyRewriteConfig grc;
-    grc.useTopDownTraversal = false;
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(tilingAndFusionPattern), grc);
+    GreedyRewriteConfig config;
+    config.useTopDownTraversal = false;
+    (void)applyPatternsAndFoldGreedily(
+        funcOp, std::move(tilingAndFusionPattern), config);
   }
 
   LinalgTilingAndFusionOptions options;
@@ -132,7 +132,18 @@
       paddingPattern.add<LinalgPaddingPattern>(funcOp.getContext(), options,
                                                filter);
     }
-    if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(paddingPattern))))
+    // Traverse the operations top down to pad producers before consumers. The
+    // extract slice operation introduced after every padded operation enables
+    // padding its consumers. Padding an operation whose producers have not been
+    // padded before fails due to the missing extract slice operations. In this
+    // case, the padding pattern increments the transformation marker without
+    // padding the operation. The top down traversal is thus not only a
+    // performance optimization but also needed to pad all operations along the
+    // use-def chains.
+    GreedyRewriteConfig config;
+    config.useTopDownTraversal = true;
+    if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(paddingPattern),
+                                            config)))
       signalPassFailure();
   }
 
diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
index d686f7d..f2154a8 100644
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ b/mlir/test/Dialect/Linalg/pad.mlir
@@ -163,6 +163,28 @@
 
 #map0 = affine_map<()[s0] -> (64, s0)>
 
+//      FILL:  pad_multiple
+// FILL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
+func @pad_multiple(%arg0: tensor<64x64xf32>,
+                      %iv0 : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %size = affine.min #map0()[%iv0]
+  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+
+  // Check both fill operations are padded by the same pad tensor operation.
+  //      FILL:  %[[T0:.*]] = linalg.pad_tensor
+  //      FILL:  %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
+  //      FILL:  %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]])
+  //      FILL:  = tensor.extract_slice %[[T2]]
+  %1 = linalg.fill(%cst, %0) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+  %2 = linalg.fill(%cst, %1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+  return %2 : tensor<?x?xf32>
+}
+
+// -----
+
+#map0 = affine_map<()[s0] -> (64, s0)>
+
 //      MATMUL:  compose_padding
 // MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
 func @compose_padding(%arg0: tensor<64x64xf32>,