[mlir][linalg] Run CSE after every CodegenStrategy transformation.

Add CSE after every transformation. Transformations such as tiling introduce redundant computation, for example, one AffineMinOp for every operand dimension pair. Follow up transformations such as Padding and Hoisting benefit from CSE since comparing slice sizes simplifies to comparing SSA values instead of analyzing affine expressions.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D114585
diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
index 8ed43c8..8beeba7 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
@@ -25,9 +25,11 @@
 #include "mlir/Dialect/Vector/VectorTransforms.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
+#include "mlir/Pass/PassManager.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/Utils.h"
 
 using namespace mlir;
@@ -335,6 +337,12 @@
 
     if (options.hoistRedundantVectorTransfersOnTensor)
       hoistRedundantVectorTransfersOnTensor(funcOp);
+
+    // Run CSE to cleanup after canonicalization.
+    OpPassManager dynamicPM("builtin.func");
+    dynamicPM.addPass(createCSEPass());
+    if (failed(runPipeline(dynamicPM, funcOp)))
+      return signalPassFailure();
   }
 
   LinalgEnablingOptions options;
diff --git a/mlir/test/Dialect/Linalg/codegen-strategy.mlir b/mlir/test/Dialect/Linalg/codegen-strategy.mlir
index fc65a59..d1deffa 100644
--- a/mlir/test/Dialect/Linalg/codegen-strategy.mlir
+++ b/mlir/test/Dialect/Linalg/codegen-strategy.mlir
@@ -40,14 +40,19 @@
 
 // -----
 
+//     CHECK-PAD-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (16, -d0 + 72)>
+
 //         CHECK-PAD: func @matmul(
 func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> {
 
   // Check the padding of the input operands has been hoisted out of the tile loop nest.
   //      CHECK-PAD-COUNT=2: linalg.pad_tensor %{{.*}} nofold
-  //      CHECK-PAD-COUNT=3: scf.for
+  //              CHECK-PAD: scf.for
+  // Check CSE eliminates the duplicate min operations introduced by tiling.
+  //              CHECK-PAD: affine.min #[[MAP0]]
+  //          CHECK-PAD-NOT: affine.min #[[MAP0]]
+  //      CHECK-PAD-COUNT=2: scf.for
   //              CHECK-PAD: linalg.matmul
   %0 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%arg2: tensor<72x72xf32>) -> tensor<72x72xf32>
   return %0 : tensor<72x72xf32>
 }
-