// RUN: transform-opt-ch2 %s \
// RUN:   --pass-pipeline="builtin.module(transform-interpreter{ \
// RUN:        debug-bind-trailing-args=linalg.matmul,linalg.elementwise},\
// RUN:        canonicalize,cse,symbol-dce)" |\
// RUN: FileCheck %s

// ****************************** IMPORTANT NOTE ******************************
//
// If you are changing this file, you may also need to change
// mlir/docs/Tutorials/Transform accordingly.
//
// ****************************************************************************

// Original function to optimize.
func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>,
                   %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>)
                   -> tensor<512x512xf32> {
  // Matrix-matrix multiplication.
  %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>)
                          outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32>

  // Elementwise addition.
  %biased = linalg.elementwise kind=#linalg.elementwise_kind<add>
    ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>)
    outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32>

  // Elementwise max with 0 (ReLU).
  %c0f = arith.constant dense<0.0> : tensor<512x512xf32>
  %relued = linalg.elementwise kind=#linalg.elementwise_kind<max_signed>
    ins(%biased, %c0f : tensor<512x512xf32>, tensor<512x512xf32>)
    outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32>
  func.return %relued : tensor<512x512xf32>
}

// CHECK-LABEL: func @fc_relu
// CHECK: scf.forall
// CHECK: scf.forall
// CHECK: %[[SLICE4:.+]] = tensor.extract_slice
// CHECK: %[[SLICE5:.+]] = tensor.extract_slice
// CHECK: %[[SLICE6:.+]] = tensor.extract_slice
// CHECK: %[[SLICE7:.+]] = tensor.extract_slice
// CHECK: %[[SLICE8:.+]] = tensor.extract_slice
// CHECK: func.call @microkernel(%[[SLICE4]], %[[SLICE5]], %[[SLICE6]], %[[SLICE7]], %[[SLICE8]])
// CHECK-NOT: linalg.matmul
// CHECK-NOT: linalg.elementwise
// CHECK: scf.forall.in_parallel
// CHECK: linalg.elementwise kind=#linalg.elementwise_kind<max_signed>
// CHECK: scf.forall.in_parallel

// Declaration of the "microkernel" function that we will be targeting.
func.func private @microkernel(
    %lhs: tensor<4x512xf32>,
    %rhs: tensor<512x4xf32>,
    %bias: tensor<4x4xf32>,
    %init: tensor<4x4xf32>,
    %output: tensor<4x4xf32>) -> tensor<4x4xf32>

module attributes {transform.with_named_sequence} {
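  // The trailing arguments of the entry point below are bound by the
  // transform-interpreter option debug-bind-trailing-args from the RUN line
  // above: %arg1 is associated with all linalg.matmul operations in the
  // payload and %arg2 with all linalg.elementwise operations.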
  transform.named_sequence @__transform_main(
      %arg0: !transform.any_op,
      %arg1: !transform.op<"linalg.matmul">,
      %arg2: !transform.op<"linalg.elementwise">) {
    // Since the %arg2 handle is associated with both elementwise operations,
    // we need to split it into two handles so we can target only the second
    // elementwise operation.
    %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elementwise">)
        -> (!transform.any_op, !transform.any_op)

    // The actual tiling transformation takes tile sizes as attributes. It
    // produces handles to the tiled operation and to the loop generated during
    // tiling.
    %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // We can now fuse the other operations into the loop. Here, we fuse the
    // operations one by one. This requires the operation that is being fused
    // to define the value used within the loop, so the order of such fusions
    // is important. We could also use "transform.merge_handles" to obtain a
    // single handle to all operations and give it to `fuse_into_containing_op`,
    // which would take care of the ordering in this case; a sketch of that
    // alternative follows the two fusions below.
    %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop
        : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2
        : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
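
    // A minimal sketch of that alternative, kept as a comment so this file
    // still exercises the one-by-one fusion above (handle names are
    // illustrative). The matmul handle is first cast to !transform.any_op so
    // that both handles being merged have the same type:
    //
    //   %matmul_any = transform.cast %arg1
    //       : !transform.op<"linalg.matmul"> to !transform.any_op
    //   %producers = transform.merge_handles %add, %matmul_any : !transform.any_op
    //   %fused, %loop_alt = transform.structured.fuse_into_containing_op %producers into %loop
    //       : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)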

    // Tile again to get the desired size. Note that this time we tile the
    // "add" operation and fuse the matmul into the loop, without affecting the
    // "max" operation. This illustrates the precise targeting possible with
    // the transform dialect; otherwise it would be difficult to differentiate
    // the "add" and "max" operations, both of which are instances of the same
    // linalg.elementwise operation.
    %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %matmul_fused_2, %loop_second_2 =
        transform.structured.fuse_into_containing_op %matmul_fused into %loop_second
        : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Since outlining is currently only implemented for region-holding
    // operations such as loops, use tiling with a size of 1 to materialize the
    // outer loop that is going to be outlined.
    %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1]
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third
        : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Rewrite the call target.
    transform.my.change_call_target %call, "microkernel" : !transform.any_op

    transform.yield
  }
}