// RUN: mlir-opt %s --transform-interpreter \
// RUN:   --test-transform-dialect-erase-schedule \
// RUN:   --math-uplift-to-fma \
// RUN:   --convert-bufferization-to-memref \
// RUN:   --test-lower-to-llvm | \
// RUN: FileCheck %s

// Fixed-size tensor types to be used in the convolution.
// Named sizes are: N=5 OH=80 OW=100 F=C=128 KH=KW=3.
// Input is NHWC.
// Filter is CHWF.
// Output is NHWF.
// Note that the input spatial dimensions are larger than the output ones:
// 82 = 80 + 3 - 1 and 102 = 100 + 3 - 1, so the convolution needs no padding.
!tinput = tensor<5x82x102x128xf32>
!tfilter = tensor<128x3x3x128xf32>
!tbias = tensor<128xf32>
!toutput = tensor<5x80x100x128xf32>

// Function containing the convolution. Note that its arguments and results
// are tensors annotated with attributes from the `bufferization` dialect.
// These attributes hint the bufferization pass that the corresponding buffers
// can be used for these tensors directly, without reshaping.
func.func @conv(
    %input: !tinput {bufferization.writable = false,
                     bufferization.access = "read",
                     bufferization.buffer_layout =
                         affine_map<(d0,d1,d2,d3)->(d0,d1,d2,d3)>},
    %filter: !tfilter {bufferization.writable = false,
                       bufferization.access = "read",
                       bufferization.buffer_layout =
                           affine_map<(d0,d1,d2,d3)->(d0,d1,d2,d3)>},
    %bias: !tbias {bufferization.writable = false,
                   bufferization.access = "read",
                   bufferization.buffer_layout = affine_map<(d0)->(d0)>},
    %output: !toutput {bufferization.writable = true,
                       bufferization.buffer_layout =
                           affine_map<(d0,d1,d2,d3)->(d0,d1,d2,d3)>,
                       bufferization.access = "write"}) -> !toutput
  // This requests a C-compatible interface to be emitted for the function
  // when translating to LLVM IR.
  attributes { llvm.emit_c_interface }
{
  // Bias. Using a named Linalg operation for brevity.
  %bias_init = tensor.empty() : !toutput
  %biased = linalg.broadcast ins(%bias : !tbias)
      outs(%bias_init : !toutput) dimensions = [0, 1, 2]
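  // In scalar terms, this computes %biased[n, y, x, c] = %bias[c] for every
  // (n, y, x), i.e., the bias vector is broadcast along the three leading
  // dimensions.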

  // Convolution proper. While Linalg has named operations for 2D convolutions,
  // the one in the Halide example has an uncommon order of filter dimensions
  // and is not supported. It also takes the filter as its first argument. This
  // code recreates it faithfully using the generic form.
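  //
  // In scalar terms, combined with the bias initialization above, the
  // computed result is:
  //
  //   %convolved[n, y, x, c] = %bias[c]
  //       + sum over (rz, ry, rx) of %input[n, y + rz, x + ry, rx]
  //                                  * %filter[rx, rz, ry, c]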
  %convolved = linalg.generic {
    iterator_types = ["parallel", "parallel", "parallel", "parallel",
                      "reduction", "reduction", "reduction"],
    indexing_maps = [
      affine_map<(n, y, x, c, rz, ry, rx) -> (rx, rz, ry, c)>,
      affine_map<(n, y, x, c, rz, ry, rx) -> (n, y+rz, x+ry, rx)>,
      affine_map<(n, y, x, c, rz, ry, rx) -> (n, y, x, c)>
    ]
  } ins(%filter, %input: !tfilter, !tinput) outs(%biased : !toutput) {
  ^bb0(%in: f32, %f: f32, %b: f32):
    // Note the fastmath attributes, which allow these operations to be
    // recombined into
    //   %0 = math.fma %in, %f, %b : f32
    // later on, and the reduction order to be changed.
    %m1 = arith.mulf %in, %f {fastmath = #arith.fastmath<fast>} : f32
    %0 = arith.addf %b, %m1 {fastmath = #arith.fastmath<fast>} : f32
    linalg.yield %0 : f32
  } -> !toutput

  // ReLU is just a max(0, x).
  %c0 = arith.constant 0.0 : f32
  %relued = linalg.generic {
    iterator_types = ["parallel", "parallel", "parallel", "parallel"],
    indexing_maps = [
      affine_map<(d0, d1, d2, d3) -> ()>,
      affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
      affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
    ]
  } ins(%c0, %convolved : f32, !toutput)
    outs(%output : !toutput) {
  ^bb0(%cst: f32, %in: f32, %out: f32):
    %0 = llvm.intr.maxnum(%cst, %in) : (f32, f32) -> f32
    linalg.yield %0 : f32
  } -> !toutput

  return %relued : !toutput
}

// Module containing the transformation script to be applied. The attribute
// is required to correctly verify the use of named (macro-like) sequences.
module attributes { transform.with_named_sequence } {
  // Apply transformations in a sequence to recreate the following Halide
  // schedule:
  //
  //   Var co, ci, xo, xi;
  //   relu.split(c, co, ci, vec * tile_w)
  //       .split(x, xo, xi, tile_h)
  //       .reorder(ci, xi, xo, y, n, co)
  //       .vectorize(ci, vec)
  //       .unroll(ci)
  //       .unroll(xi);
  //   conv.compute_at(relu, xo)
  //       .vectorize(c, vec)
  //       .unroll(c)
  //       .unroll(x)
  //       .unroll(y)
  //       .update()
  //       .reorder(c, x, y, r.x, r.y, r.z, n)
  //       .vectorize(c, vec)
  //       .unroll(c)
  //       .unroll(x)
  //       .unroll(y)
  //       .unroll(r.x, 2);
  //
  // where tile_w = 4, tile_h = 5, vec = 16. Note that unroll(y) and
  // unroll(r.x) have no effect on the Halide IR as of
  // 294f80c49bf3bb8582446613c25fcce03b82. Also note that the order of
  // dimensions in Halide is inverted, e.g., co and n are the outermost loops
  // in the respective reorder directives.
  transform.named_sequence @__transform_main(
      // This argument will point to the top-level module.
      %arg0: !transform.any_op) {

    // 1. Find the operations we are going to transform using their names.
    // This is a simplistic approach that works when there are few operations
    // in the IR to be transformed. More complex scenarios should rely on
    // operations with the `transform.match` prefix, which are out of scope
    // for this chapter.
    %bias = transform.structured.match ops{["linalg.broadcast"]} in %arg0
      : (!transform.any_op) -> !transform.any_op
    %generics = transform.structured.match ops{["linalg.generic"]} in %arg0
      : (!transform.any_op) -> !transform.any_op
    %conv, %relu = transform.split_handle %generics
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // 2. Initial tiling to start producing the loop structure. Note that the
    // linalg.generic operation has the implicit loop order (n, y, x, c). Since
    // the desired order of dimensions is (co, n, y, xo, xi, ci), we first tile
    // only the c dimension to materialize the outermost co loop, and then tile
    // the other dimensions since they are already in the expected order.
    // Tiling by 1 produces a loop that iterates along the entire dimension.
    // Tiling by 0 does not produce a loop. The size 64 is chosen as 4*16,
    // where 16 is the AVX512 vector length. Note that structured tiling
    // doesn't remove the dimensions that became trivial (unit size), so the
    // resulting structure is technically
    // (co, no=n, yo=y, xo, [ni=1, yi=1, xi, ci]), where brackets indicate the
    // implicit loops of the `linalg.generic` operation inside the loops
    // produced by tiling.
    //
    // Tile sizes below are ordered as [n, y, x, c].
    %relu2, %co = transform.structured.tile_using_forall %relu
        tile_sizes [0, 0, 0, 64]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %relu3, %n_y_xo = transform.structured.tile_using_forall %relu2
        tile_sizes [1, 1, 5, 0]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
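
    // After these two tilings, the loop structure reads roughly as follows
    // (a sketch; induction variables, subset computations and types are
    // omitted):
    //
    //   scf.forall (%co) in (2) {                    // 128 / 64 = 2 c-tiles.
    //     scf.forall (%n, %y, %xo) in (5, 80, 20) {  // 100 / 5 = 20 x-tiles.
    //       linalg.generic ...   // ReLU on a 1x1x5x64 subset.
    //     }
    //   }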

    // Halide's compute_at directive is actually fusion into the given loop:
    // we start from a fully fissioned form, whereas Halide starts from a
    // fused form and reuses the loop iterators.
    %conv2, %co2 = transform.structured.fuse_into_containing_op %conv into %co
      : (!transform.any_op, !transform.any_op)
        -> (!transform.any_op, !transform.any_op)
    %conv3, %n_y_xo2 = transform.structured.fuse_into_containing_op %conv2
        into %n_y_xo
      : (!transform.any_op, !transform.any_op)
        -> (!transform.any_op, !transform.any_op)

    // Also fuse the bias, which we represent as a separate operation and
    // Halide represents as the "pure" (as opposed to "update") part of the
    // conv expression. Note that fusion consumes both handles and produces
    // new handles for chaining purposes.
    %bias2, %co3 = transform.structured.fuse_into_containing_op %bias into %co2
      : (!transform.any_op, !transform.any_op)
        -> (!transform.any_op, !transform.any_op)
    %bias3, %n_y_xo3 = transform.structured.fuse_into_containing_op %bias2
        into %n_y_xo2
      : (!transform.any_op, !transform.any_op)
        -> (!transform.any_op, !transform.any_op)
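
    // At this point, the body of the innermost loop contains, in order, the
    // fused bias broadcast, the convolution and the ReLU, each computing its
    // own 1x1x5x64 subset of the result.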

    // Clean up the result of fusion, which mechanically duplicates the
    // producer operation in the consumer loop without removing the original
    // operation. The original operation is now "dead": it has no uses and no
    // side effects, so it can be removed by the dead-code elimination (DCE)
    // that runs as part of pattern rewriting. The transform dialect allows
    // applying a combination of named pattern sets, exposed as operations, in
    // one sweep to an isolated-from-above container payload operation. Note
    // that we don't actually need any patterns for DCE to run; it suffices to
    // trigger the rewriting.
    //
    // This step is optional: the transformation can continue without it and
    // produce the same final IR, but the cleanup makes it easier to manually
    // examine the intermediate stages.
    %f00 = transform.structured.match ops{["func.func"]} in %arg0
      : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %f00 {
    } : !transform.any_op

    // The loop reordering requested for the convolution operation requires
    // putting the reduction loops (r.z, r.y, r.x) before the "inner" loops
    // xi, ci. The "inner" loops are still implicit as part of the
    // linalg.generic operation, and we need to materialize the reduction
    // loops around it by tiling with size 1. Since we are producing reduction
    // loops, we indicate that we are tiling a reduction and request
    // sequential `scf.for` loops (parallel reductions are supported via
    // `scf.forall`, but we don't need them here).
    //
    // This transform operation is more capable than merely producing
    // (reduction) loops: the transformed code performs `tile_size` partial
    // reductions of `N / tile_size` elements, potentially in parallel by
    // changing the dimension kind of the structured operation inside the
    // loop, and then performs a final reduction of these partial results by
    // producing a new "combiner" structured operation after the loops. In our
    // case, tile_size = 1 along all dimensions, so the reduction is entirely
    // performed by the generated loops. The combiner structured operation is
    // still produced and adds up the reduction result with the initial value.
    //
    // Tile sizes below are ordered as [n, y, x, c, rz, ry, rx].
    %red_fill, %conv4, %combining, %rz_ry_rx
        = transform.structured.tile_reduction_using_for %conv3 by
          tile_sizes=[0, 0, 0, 0, 1, 1, 1]
      : (!transform.any_op)
        -> (!transform.any_op, !transform.any_op, !transform.any_op,
            !transform.any_op)
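
    // Schematically (a sketch; iteration arguments and subset computations
    // are omitted), the innermost loop body now contains:
    //
    //   %acc = linalg.fill ...       // %red_fill: neutral-element init.
    //   %red = scf.for %rz ... {     // %rz_ry_rx: materialized reduction
    //     scf.for %ry ... {          // loops.
    //       scf.for %rx ... {
    //         linalg.generic ...     // %conv4: one (rz, ry, rx) point at
    //       }                        // a time.
    //     }
    //   }
    //   linalg.generic ...           // %combining: adds %red to the
    //                                // initial value of the tile.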

    // At this point, the inner Linalg operations have implicit iteration
    // spaces of 5x64 size, with some additional unit-size dimensions.
    // Completely replicating the Halide schedule would require materializing
    // the loops with 5 and 4 iterations, respectively, unrolling those loops,
    // and marking the remaining 16-point iteration space for vectorization.
    //
    // This is unnecessary in MLIR, which supports multi-dimensional vectors
    // that are decomposed into target-specific sizes during lowering.
    // Therefore, this schedule stops here.

    // Transform the named broadcast operation used for the bias into the
    // generic form before vectorization to prevent special cases from
    // kicking in.
    transform.structured.generalize %bias3
      : (!transform.any_op) -> !transform.any_op
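
    // After generalization, the bias computation reads schematically as
    // follows (ignoring the effect of tiling and fusion on operand types):
    //
    //   linalg.generic {
    //     indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>,
    //                      affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
    //     iterator_types = ["parallel", "parallel", "parallel", "parallel"]
    //   } ins(%bias : !tbias) outs(...) { ... }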

    // Use the named macro to perform most of the lowering.
    transform.include @lower failures(propagate) (%arg0)
      : (!transform.any_op) -> ()
    transform.yield
  }

  // A named sequence of transformations is a macro-like object that can be
  // included from other places in the transform dialect, but doesn't allow
  // recursion. This one can be reused in other scenarios.
  transform.named_sequence @lower(
      %arg0: !transform.any_op {transform.consumed}) {
    %f00 = transform.structured.match ops{["func.func"]} in %arg0
      : (!transform.any_op) -> !transform.any_op

    // Simplify the code, as tiling and fusion may have produced a lot of
    // operations computing tensor subsets and loop ranges, some of which may
    // be duplicated or excessively complex. Simplification involving
    // canonicalization, common subexpression elimination, loop-invariant code
    // motion and various rewrite patterns can be applied directly from the
    // transform dialect. Furthermore, an arbitrary combination of rewrite
    // patterns can be applied in one sweep to a given scope, a functionality
    // that cannot be achieved with conventional compiler passes that apply
    // each group of patterns separately (at least without creating a new pass
    // for each combination of pattern groups).
    transform.apply_patterns to %f00 {
      transform.apply_patterns.canonicalization
      transform.apply_patterns.linalg.tiling_canonicalization
    } : !transform.any_op
    transform.apply_cse to %f00 : !transform.any_op
    %all_loops = transform.structured.match interface{LoopLikeInterface}
        in %arg0
      : (!transform.any_op) -> !transform.any_op
    transform.apply_licm to %all_loops : !transform.any_op

    // Tiling-by-one as a way of materializing loops has produced operations
    // processing 4+D types in which only a handful of dimensions aren't
    // unit-sized, e.g., tensor<1x1x1x5x64xf32> where 5 and 64 are tile sizes.
    // Remove such unit dimensions before vectorization, for clarity.
    transform.apply_patterns to %f00 {
      transform.apply_patterns.linalg.fold_unit_extent_dims_via_reshapes
    } : !transform.any_op
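
    // For example (a sketch; the actual reassociation is chosen by the
    // pattern), a tensor<1x1x1x5x64xf32> value is collapsed as:
    //
    //   %c = tensor.collapse_shape %t [[0, 1, 2, 3], [4]]
    //       : tensor<1x1x1x5x64xf32> into tensor<5x64xf32>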

    // Vectorize the remaining non-unit dimensions in structured operations.
    // This essentially rewrites operations on `tensor<5x64xf32>` into
    // operations on `vector<5x64xf32>`. Further lowering in MLIR and LLVM
    // will decompose this into a sequence of operations on one-dimensional
    // vectors of the platform-relevant size, e.g., `vector<16xf32>` for
    // AVX512. High-level vector primitives, such as `vector.transpose` and
    // `vector.broadcast`, can be introduced at this stage. They will later be
    // lowered to sequences of lower-level primitives such as `vector.shuffle`,
    // depending on the selected lowering strategy.
    %fv = transform.structured.vectorize_children_and_apply_patterns %f00
      : (!transform.any_op) -> !transform.any_op
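
    // Schematically, the tiled computation now moves data between tensors
    // and multi-dimensional vectors (a sketch; indices, padding values and
    // in-bounds attributes are omitted):
    //
    //   %v = vector.transfer_read %subset[...]
    //       : tensor<5x64xf32>, vector<5x64xf32>
    //   ... arithmetic on vector<5x64xf32> values ...
    //   %t = vector.transfer_write %res, %subset[...]
    //       : vector<5x64xf32>, tensor<5x64xf32>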

    // Vectorization may have created new opportunities for cleanups. In
    // particular, tensor subsetting operations can be composed with vector
    // operations, and vector transfer (multi-dimensional load/store)
    // operations can be recombined and hoisted out of loops.
    transform.apply_patterns to %fv {
      transform.apply_patterns.canonicalization
      transform.apply_patterns.tensor.fold_tensor_subset_ops_into_vector_transfers
    } : !transform.any_op
    transform.apply_cse to %fv : !transform.any_op
    transform.structured.hoist_redundant_vector_transfers %fv
      : (!transform.any_op) -> !transform.any_op

    // Apply bufferization, which rewrites the remaining operations on
    // tensors as operations on structured buffer (memref) types, including
    // the function API. MLIR bufferization uses destination-passing style,
    // meaning that a buffer is shared between one of the operation's operands
    // and its result.
    //
    // Since bufferization rewrites function signatures, it is applied as a
    // module-wide transformation. Therefore, it invalidates all previously
    // defined handles. Bufferization is usually a late step in the
    // transformation process, so invalidation is not an issue. However, if
    // other transformations, such as loop unrolling, are required after
    // bufferization, new handles should be produced using the match
    // operations.
    //
    // One-shot bufferization itself does not produce buffer deallocations,
    // which may lead to leaks, so we have to run the buffer deallocation pass
    // pipeline to avoid them. Note that the transform dialect seamlessly runs
    // named passes and pass pipelines: if desired, one could replace complex
    // --pass-pipeline expressions with operations. Also note that we apply
    // the pipeline to functions rather than to the entire module to avoid
    // running it on the transform IR that is contained in the module.
    %arg1 = transform.bufferization.one_shot_bufferize %arg0 {
        bufferize_function_boundaries = true,
        function_boundary_type_conversion = 1 : i32 }
      : (!transform.any_op) -> !transform.any_op
    %f = transform.structured.match ops{["func.func"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.apply_registered_pass "buffer-deallocation-pipeline" to %f
      : (!transform.any_op) -> !transform.any_op
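
    // Given the identity buffer-layout hints on the original function
    // arguments, the bufferized signature uses plain memrefs, roughly:
    //
    //   func.func @conv(%input: memref<5x82x102x128xf32>,
    //                   %filter: memref<128x3x3x128xf32>,
    //                   %bias: memref<128xf32>,
    //                   %output: memref<5x80x100x128xf32>) ...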

    // Apply general canonicalization and CSE to each function after
    // bufferization as new simplification opportunities may have appeared.
    %fb = transform.structured.match ops{["func.func"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %fb {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    transform.apply_cse to %fb : !transform.any_op

    // Lower complex, multidimensional vector operations into simpler
    // primitives. This particular selection of the pattern groups corresponds
    // to the vector dialect operations present in the payload IR at this
    // stage. Many of these groups can be parameterized to use different
    // strategies or lower-level primitives offering performance trade-offs.
    // In this case, we are selecting the simplest strategies.
    transform.apply_patterns to %fb {
      transform.apply_patterns.vector.lower_contraction
          lowering_strategy = parallelarith
      transform.apply_patterns.vector.lower_transfer
          max_transfer_rank = 1
      transform.apply_patterns.vector.lower_transpose
          lowering_strategy = eltwise
      transform.apply_patterns.vector.lower_shape_cast
    } : !transform.any_op

    // These patterns are applied in a separate sweep to avoid the
    // transfer-to-scf patterns overlapping with the lower-transfer patterns
    // above, as they apply to the same kind of operations. These patterns may
    // produce local allocations to act as temporary caches deep inside loops,
    // which could lead to catastrophic performance. Such allocations are
    // moved onto the stack and hoisted out of all the surrounding loops.
    transform.apply_patterns to %fb {
      transform.apply_patterns.vector.transfer_to_scf
      transform.apply_patterns.memref.alloc_to_alloca
    } : !transform.any_op
    transform.bufferization.buffer_loop_hoisting %fb : !transform.any_op
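
    // For example (a sketch with a hypothetical buffer type), a temporary
    //
    //   %tmp = memref.alloc() : memref<5x64xf32>
    //
    // becomes a stack allocation
    //
    //   %tmp = memref.alloca() : memref<5x64xf32>
    //
    // that buffer_loop_hoisting can then move out of the surrounding loops.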

    // A final round of cleanups additionally includes patterns to simplify
    // buffer aliasing operations that may have been introduced during
    // bufferization and could result in excessively complex address
    // computation.
    transform.apply_patterns to %fb {
      transform.apply_patterns.memref.fold_memref_alias_ops
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    transform.apply_cse to %fb : !transform.any_op

    transform.yield
  }
}

// The core computation, at the LLVM dialect level, must correspond to five
// immediately adjacent fma operations on vector<64xf32>.

// CHECK: %[[R0:.+]] = llvm.mlir.undef : !llvm.array<5 x vector<64xf32>>

// CHECK: %[[V:.+]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[LINE0:.+]] = llvm.extractvalue %[[V]][0] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA0:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE0]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R1:.+]] = llvm.insertvalue %[[FMA0]], %[[R0]][0]

// CHECK-NEXT: %[[LINE1:.+]] = llvm.extractvalue %[[V]][1] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA1:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE1]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R2:.+]] = llvm.insertvalue %[[FMA1]], %[[R1]][1]

// CHECK-NEXT: %[[LINE2:.+]] = llvm.extractvalue %[[V]][2] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA2:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE2]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R3:.+]] = llvm.insertvalue %[[FMA2]], %[[R2]][2]

// CHECK-NEXT: %[[LINE3:.+]] = llvm.extractvalue %[[V]][3] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA3:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE3]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R4:.+]] = llvm.insertvalue %[[FMA3]], %[[R3]][3]

// CHECK-NEXT: %[[LINE4:.+]] = llvm.extractvalue %[[V]][4] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA4:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE4]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R5:.+]] = llvm.insertvalue %[[FMA4]], %[[R4]][4]