// mlir/test/Interfaces/TilingInterface/tile-elementwise.mlir - llvm-project - Git at Google

 // RUN: mlir-opt --transform-interpreter --cse -split-input-file %s | FileCheck %s

 // Static binary add, tile both dims.

 // Element-wise add of %A and %B, written into %C; all operands are
 // statically shaped 128x256 f32 tensors.
 func.func @tile_elementwise(
     %A: tensor<128x256xf32>,
     %B: tensor<128x256xf32>,
     %C: tensor<128x256xf32>) -> tensor<128x256xf32> {
   %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
            ins(%A, %B : tensor<128x256xf32>, tensor<128x256xf32>)
            outs(%C : tensor<128x256xf32>) -> tensor<128x256xf32>
   return %sum : tensor<128x256xf32>
 }

 // Transform script: match the linalg.elementwise op under %root and tile it
 // with tile_using_for using tile sizes [32, 64].
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
       %root : !transform.any_op {transform.readonly}) {
     %target = transform.structured.match ops{["linalg.elementwise"]} in %root
         : (!transform.any_op) -> !transform.any_op
     %tiled_op, %for0, %for1 =
         transform.structured.tile_using_for %target tile_sizes [32, 64]
         : (!transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }

 // CHECK-LABEL: func.func @tile_elementwise(
 // CHECK-SAME:    %[[A:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 // CHECK-SAME:    %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 // CHECK-SAME:    %[[C:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C32:.+]] = arith.constant 32 : index
 //  CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
 //  CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 //  CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
 //      CHECK:   scf.for %[[IV0:.+]] = %[[C0]] to %[[C128]] step %[[C32]]
 // CHECK-SAME:       iter_args(%[[INIT0:.+]] = %[[C]])
 //      CHECK:     scf.for %[[IV1:.+]] = %[[C0]] to %[[C256]] step %[[C64]]
 // CHECK-SAME:         iter_args(%[[INIT1:.+]] = %[[INIT0]])
 //  CHECK-DAG:       %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //  CHECK-DAG:       %[[BT:.+]] = tensor.extract_slice %[[B]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //  CHECK-DAG:       %[[CT:.+]] = tensor.extract_slice %[[INIT1]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //      CHECK:       %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
 // CHECK-SAME:           ins(%[[AT]], %[[BT]] :
 // CHECK-SAME:           outs(%[[CT]] :
 //      CHECK:       %[[INS:.+]] = tensor.insert_slice %[[TILED]] into %[[INIT1]]
 // CHECK-SAME:           [%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //      CHECK:       scf.yield %[[INS]]
 //      CHECK:     scf.yield

 // -----

 // Dynamic binary add.

 // Element-wise add of %A and %B, written into %C; every operand is a
 // tensor<?x?xf32>, so both dimensions are dynamic.
 func.func @tile_elementwise_dynamic(
     %A: tensor<?x?xf32>,
     %B: tensor<?x?xf32>,
     %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
            ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
            outs(%C : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %sum : tensor<?x?xf32>
 }

 // Transform script: match the linalg.elementwise op under %root and tile it
 // with tile_using_for using tile sizes [10, 20].
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
       %root : !transform.any_op {transform.readonly}) {
     %target = transform.structured.match ops{["linalg.elementwise"]} in %root
         : (!transform.any_op) -> !transform.any_op
     %tiled_op, %for0, %for1 =
         transform.structured.tile_using_for %target tile_sizes [10, 20]
         : (!transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }

 //  CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
 //  CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
 // CHECK-LABEL: func.func @tile_elementwise_dynamic(
 // CHECK-SAME:    %[[A:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:    %[[B:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:    %[[C:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[A]], %[[C0]]
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 //  CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[A]], %[[C1]]
 //  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
 //  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
 //      CHECK:   scf.for %[[IV0:.+]] = %[[C0]] to %[[D0]] step %[[C10]]
 // CHECK-SAME:       iter_args(%[[INIT0:.+]] = %[[C]])
 //      CHECK:     scf.for %[[IV1:.+]] = %[[C0]] to %[[D1]] step %[[C20]]
 // CHECK-SAME:         iter_args(%[[INIT1:.+]] = %[[INIT0]])
 //      CHECK:       %[[TS0:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[D0]]]
 //      CHECK:       %[[TS1:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[D1]]]
 //  CHECK-DAG:       %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]
 //  CHECK-DAG:       %[[BT:.+]] = tensor.extract_slice %[[B]][%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]
 //  CHECK-DAG:       %[[CT:.+]] = tensor.extract_slice %[[INIT1]][%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]
 //      CHECK:       %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
 // CHECK-SAME:           ins(%[[AT]], %[[BT]] :
 // CHECK-SAME:           outs(%[[CT]] :
 //      CHECK:       tensor.insert_slice %[[TILED]] into %[[INIT1]]
 // CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]

 // -----

 // Memref variant: no iter_args, uses memref.subview instead of tensor.extract_slice.

 // Element-wise negf on memrefs: reads %A and writes %B. The op has no
 // results, so the function returns nothing.
 func.func @tile_elementwise_memref(
     %A: memref<128x256xf32>,
     %B: memref<128x256xf32>) {
   linalg.elementwise kind=#linalg.elementwise_kind<negf>
     ins(%A : memref<128x256xf32>)
     outs(%B : memref<128x256xf32>)
   return
 }

 // Transform script: match the linalg.elementwise op under %root and tile it
 // with tile_using_for using tile sizes [32, 64].
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
       %root : !transform.any_op {transform.readonly}) {
     %target = transform.structured.match ops{["linalg.elementwise"]} in %root
         : (!transform.any_op) -> !transform.any_op
     %tiled_op, %for0, %for1 =
         transform.structured.tile_using_for %target tile_sizes [32, 64]
         : (!transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }

 // CHECK-LABEL: func.func @tile_elementwise_memref(
 // CHECK-SAME:    %[[A:[a-zA-Z0-9]+]]: memref<128x256xf32>
 // CHECK-SAME:    %[[B:[a-zA-Z0-9]+]]: memref<128x256xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C32:.+]] = arith.constant 32 : index
 //  CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
 //  CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 //  CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
 //      CHECK:   scf.for %[[IV0:.+]] = %[[C0]] to %[[C128]] step %[[C32]]
 //  CHECK-NOT:     iter_args
 //      CHECK:     scf.for %[[IV1:.+]] = %[[C0]] to %[[C256]] step %[[C64]]
 //  CHECK-DAG:       %[[AT:.+]] = memref.subview %[[A]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //  CHECK-DAG:       %[[BT:.+]] = memref.subview %[[B]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //      CHECK:       linalg.elementwise kind=#linalg.elementwise_kind<negf>
 // CHECK-SAME:           ins(%[[AT]] :
 // CHECK-SAME:           outs(%[[BT]] :

 // -----

 // Parallel tiling with scf.forall: produces parallel_insert_slice.

 // Element-wise add of %A and %B, written into %C; all operands are
 // statically shaped 128x256 f32 tensors.
 func.func @tile_elementwise_forall(
     %A: tensor<128x256xf32>,
     %B: tensor<128x256xf32>,
     %C: tensor<128x256xf32>) -> tensor<128x256xf32> {
   %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
            ins(%A, %B : tensor<128x256xf32>, tensor<128x256xf32>)
            outs(%C : tensor<128x256xf32>) -> tensor<128x256xf32>
   return %sum : tensor<128x256xf32>
 }

 // Transform script: match the linalg.elementwise op under %root and tile it
 // with tile_using_forall using tile sizes [32, 64].
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
       %root : !transform.any_op {transform.readonly}) {
     %target = transform.structured.match ops{["linalg.elementwise"]} in %root
         : (!transform.any_op) -> !transform.any_op
     %tiled_op, %forall_op =
         transform.structured.tile_using_forall %target tile_sizes [32, 64]
         : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }

 //  CHECK-DAG: #[[$MAPX:.+]] = affine_map<(d0) -> (d0 * 32)>
 //  CHECK-DAG: #[[$MAPY:.+]] = affine_map<(d0) -> (d0 * 64)>
 // CHECK-LABEL: func.func @tile_elementwise_forall(
 // CHECK-SAME:    %[[A:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 // CHECK-SAME:    %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 // CHECK-SAME:    %[[C:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 //      CHECK:   %[[RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (4, 4)
 // CHECK-SAME:       shared_outs(%[[INIT:.+]] = %[[C]])
 //  CHECK-DAG:     %[[OFF0:.+]] = affine.apply #[[$MAPX]](%[[IV0]])
 //  CHECK-DAG:     %[[OFF1:.+]] = affine.apply #[[$MAPY]](%[[IV1]])
 //  CHECK-DAG:     %[[AT:.+]] = tensor.extract_slice %[[A]][%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
 //  CHECK-DAG:     %[[BT:.+]] = tensor.extract_slice %[[B]][%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
 //  CHECK-DAG:     %[[CT:.+]] = tensor.extract_slice %[[INIT]][%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
 //      CHECK:     %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
 // CHECK-SAME:         ins(%[[AT]], %[[BT]] :
 // CHECK-SAME:         outs(%[[CT]] :
 //      CHECK:     scf.forall.in_parallel
 //      CHECK:       tensor.parallel_insert_slice %[[TILED]] into %[[INIT]]
 // CHECK-SAME:           [%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
 //      CHECK:   return %[[RESULT]]

 // -----

 // Broadcast: non-identity indexing map.  The input has rank 1 (indexed by d1
 // only), so it is sliced along d1 alone; tiling along d0 leaves it untouched.

 #map_in  = affine_map<(d0, d1) -> (d1)>
 #map_out = affine_map<(d0, d1) -> (d0, d1)>

 // Element-wise exp with explicit indexing maps: the rank-1 input %A uses
 // #map_in and the rank-2 output %B uses #map_out.
 func.func @tile_elementwise_broadcast(
     %A: tensor<256xf32>,
     %B: tensor<128x256xf32>) -> tensor<128x256xf32> {
   %bcast_exp = linalg.elementwise kind=#linalg.elementwise_kind<exp>
                  indexing_maps = [#map_in, #map_out]
                  ins(%A : tensor<256xf32>)
                  outs(%B : tensor<128x256xf32>) -> tensor<128x256xf32>
   return %bcast_exp : tensor<128x256xf32>
 }

 // Transform script: match the linalg.elementwise op under %root and tile it
 // with tile_using_for using tile sizes [32, 64].
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
       %root : !transform.any_op {transform.readonly}) {
     %target = transform.structured.match ops{["linalg.elementwise"]} in %root
         : (!transform.any_op) -> !transform.any_op
     %tiled_op, %for0, %for1 =
         transform.structured.tile_using_for %target tile_sizes [32, 64]
         : (!transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }

 // CHECK-LABEL: func.func @tile_elementwise_broadcast(
 // CHECK-SAME:    %[[A:[a-zA-Z0-9]+]]: tensor<256xf32>
 // CHECK-SAME:    %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 //      CHECK:   scf.for %[[IV0:[a-zA-Z0-9]+]] =
 // CHECK-SAME:       iter_args(%[[INIT0:[a-zA-Z0-9]+]] = %[[B]])
 //      CHECK:     scf.for %[[IV1:[a-zA-Z0-9]+]] =
 // CHECK-SAME:         iter_args(%[[INIT1:[a-zA-Z0-9]+]] = %[[INIT0]])
 // Input is 1-D: sliced only along d1, not d0.
 //      CHECK:       %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV1]]] [64] [1]
 //      CHECK:       %[[BT:.+]] = tensor.extract_slice %[[INIT1]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //      CHECK:       %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<exp>
 // CHECK-SAME:           ins(%[[AT]] : tensor<64xf32>)
 // CHECK-SAME:           outs(%[[BT]] : tensor<32x64xf32>)
 //      CHECK:       tensor.insert_slice %[[TILED]] into %[[INIT1]]
 // CHECK-SAME:           [%[[IV0]], %[[IV1]]] [32, 64] [1, 1]

 // -----

 // Tile-and-fuse: exp producer is fused into the tiled add consumer.

 // Producer/consumer chain: exp(%A) is computed into a fresh tensor.empty,
 // then added to %B into a second tensor.empty; the add result is returned.
 func.func @tile_and_fuse_elementwise(
     %A: tensor<128x256xf32>,
     %B: tensor<128x256xf32>) -> tensor<128x256xf32> {
   // Producer: exp of %A.
   %exp_init = tensor.empty() : tensor<128x256xf32>
   %exp_val = linalg.elementwise kind=#linalg.elementwise_kind<exp>
                ins(%A : tensor<128x256xf32>)
                outs(%exp_init : tensor<128x256xf32>) -> tensor<128x256xf32>
   // Consumer: add of the exp result and %B.
   %add_init = tensor.empty() : tensor<128x256xf32>
   %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
            ins(%exp_val, %B : tensor<128x256xf32>, tensor<128x256xf32>)
            outs(%add_init : tensor<128x256xf32>) -> tensor<128x256xf32>
   return %sum : tensor<128x256xf32>
 }

 // Transform script: match the linalg.elementwise op whose kind is add and
 // apply transform.structured.fuse to it with tile sizes [32, 64].
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
       %root : !transform.any_op {transform.readonly}) {
     %consumer = transform.structured.match ops{["linalg.elementwise"]}
         attributes{kind = #linalg.elementwise_kind<add>} in %root
         : (!transform.any_op) -> !transform.any_op
     %fused_op, %for0, %for1 =
         transform.structured.fuse %consumer tile_sizes [32, 64]
         : (!transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }

 // CHECK-LABEL: func.func @tile_and_fuse_elementwise(
 // CHECK-SAME:    %[[A:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 // CHECK-SAME:    %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 //      CHECK:   %[[EMPTY:.+]] = tensor.empty()
 //      CHECK:   scf.for %[[IV0:[a-zA-Z0-9]+]] =
 // CHECK-SAME:       iter_args(%[[INIT0:[a-zA-Z0-9]+]] = %[[EMPTY]])
 //      CHECK:     scf.for %[[IV1:[a-zA-Z0-9]+]] =
 // CHECK-SAME:         iter_args(%[[INIT1:[a-zA-Z0-9]+]] = %[[INIT0]])
 // exp tile is generated from the original input, not the full exp result.
 //  CHECK-DAG:       %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //      CHECK:       %[[EXP_TILE:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<exp>
 // CHECK-SAME:           ins(%[[AT]] :
 //  CHECK-DAG:       %[[BT:.+]] = tensor.extract_slice %[[B]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
 //      CHECK:       %[[ADD_TILE:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
 // CHECK-SAME:           ins(%[[EXP_TILE]], %[[BT]] :
 //      CHECK:       tensor.insert_slice %[[ADD_TILE]] into %[[INIT1]]
 // CHECK-SAME:           [%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// RUN: mlir-opt --transform-interpreter --cse -split-input-file %s | FileCheck %s

	// Static binary add, tile both dims.

	// Element-wise add of %A and %B, written into %C; all operands are
	// statically shaped 128x256 f32 tensors.
	func.func @tile_elementwise(
	    %A: tensor<128x256xf32>,
	    %B: tensor<128x256xf32>,
	    %C: tensor<128x256xf32>) -> tensor<128x256xf32> {
	  %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
	           ins(%A, %B : tensor<128x256xf32>, tensor<128x256xf32>)
	           outs(%C : tensor<128x256xf32>) -> tensor<128x256xf32>
	  return %sum : tensor<128x256xf32>
	}

	// Transform script: match the linalg.elementwise op under %root and tile it
	// with tile_using_for using tile sizes [32, 64].
	module attributes {transform.with_named_sequence} {
	  transform.named_sequence @__transform_main(
	      %root : !transform.any_op {transform.readonly}) {
	    %target = transform.structured.match ops{["linalg.elementwise"]} in %root
	        : (!transform.any_op) -> !transform.any_op
	    %tiled_op, %for0, %for1 =
	        transform.structured.tile_using_for %target tile_sizes [32, 64]
	        : (!transform.any_op)
	          -> (!transform.any_op, !transform.any_op, !transform.any_op)
	    transform.yield
	  }
	}

	// CHECK-LABEL: func.func @tile_elementwise(
	// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK-SAME: %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK-SAME: %[[C:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
	// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
	// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
	// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
	// CHECK-DAG: %[[C256:.+]] = arith.constant 256 : index
	// CHECK: scf.for %[[IV0:.+]] = %[[C0]] to %[[C128]] step %[[C32]]
	// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[C]])
	// CHECK: scf.for %[[IV1:.+]] = %[[C0]] to %[[C256]] step %[[C64]]
	// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
	// CHECK-DAG: %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK-DAG: %[[BT:.+]] = tensor.extract_slice %[[B]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK-DAG: %[[CT:.+]] = tensor.extract_slice %[[INIT1]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK: %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
	// CHECK-SAME: ins(%[[AT]], %[[BT]] :
	// CHECK-SAME: outs(%[[CT]] :
	// CHECK: %[[INS:.+]] = tensor.insert_slice %[[TILED]] into %[[INIT1]]
	// CHECK-SAME: [%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK: scf.yield %[[INS]]
	// CHECK: scf.yield

	// -----

	// Dynamic binary add.

	// Element-wise add of %A and %B, written into %C; every operand is a
	// tensor<?x?xf32>, so both dimensions are dynamic.
	func.func @tile_elementwise_dynamic(
	    %A: tensor<?x?xf32>,
	    %B: tensor<?x?xf32>,
	    %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
	  %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
	           ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
	           outs(%C : tensor<?x?xf32>) -> tensor<?x?xf32>
	  return %sum : tensor<?x?xf32>
	}

	// Transform script: match the linalg.elementwise op under %root and tile it
	// with tile_using_for using tile sizes [10, 20].
	module attributes {transform.with_named_sequence} {
	  transform.named_sequence @__transform_main(
	      %root : !transform.any_op {transform.readonly}) {
	    %target = transform.structured.match ops{["linalg.elementwise"]} in %root
	        : (!transform.any_op) -> !transform.any_op
	    %tiled_op, %for0, %for1 =
	        transform.structured.tile_using_for %target tile_sizes [10, 20]
	        : (!transform.any_op)
	          -> (!transform.any_op, !transform.any_op, !transform.any_op)
	    transform.yield
	  }
	}

	// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
	// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
	// CHECK-LABEL: func.func @tile_elementwise_dynamic(
	// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: tensor<?x?xf32>
	// CHECK-SAME: %[[B:[a-zA-Z0-9]+]]: tensor<?x?xf32>
	// CHECK-SAME: %[[C:[a-zA-Z0-9]+]]: tensor<?x?xf32>
	// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
	// CHECK-DAG: %[[D0:.+]] = tensor.dim %[[A]], %[[C0]]
	// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
	// CHECK-DAG: %[[D1:.+]] = tensor.dim %[[A]], %[[C1]]
	// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
	// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
	// CHECK: scf.for %[[IV0:.+]] = %[[C0]] to %[[D0]] step %[[C10]]
	// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[C]])
	// CHECK: scf.for %[[IV1:.+]] = %[[C0]] to %[[D1]] step %[[C20]]
	// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
	// CHECK: %[[TS0:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[D0]]]
	// CHECK: %[[TS1:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[D1]]]
	// CHECK-DAG: %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]
	// CHECK-DAG: %[[BT:.+]] = tensor.extract_slice %[[B]][%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]
	// CHECK-DAG: %[[CT:.+]] = tensor.extract_slice %[[INIT1]][%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]
	// CHECK: %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
	// CHECK-SAME: ins(%[[AT]], %[[BT]] :
	// CHECK-SAME: outs(%[[CT]] :
	// CHECK: tensor.insert_slice %[[TILED]] into %[[INIT1]]
	// CHECK-SAME: [%[[IV0]], %[[IV1]]] [%[[TS0]], %[[TS1]]] [1, 1]

	// -----

	// Memref variant: no iter_args, uses memref.subview instead of tensor.extract_slice.

	// Element-wise negf on memrefs: reads %A and writes %B. The op has no
	// results, so the function returns nothing.
	func.func @tile_elementwise_memref(
	    %A: memref<128x256xf32>,
	    %B: memref<128x256xf32>) {
	  linalg.elementwise kind=#linalg.elementwise_kind<negf>
	    ins(%A : memref<128x256xf32>)
	    outs(%B : memref<128x256xf32>)
	  return
	}

	// Transform script: match the linalg.elementwise op under %root and tile it
	// with tile_using_for using tile sizes [32, 64].
	module attributes {transform.with_named_sequence} {
	  transform.named_sequence @__transform_main(
	      %root : !transform.any_op {transform.readonly}) {
	    %target = transform.structured.match ops{["linalg.elementwise"]} in %root
	        : (!transform.any_op) -> !transform.any_op
	    %tiled_op, %for0, %for1 =
	        transform.structured.tile_using_for %target tile_sizes [32, 64]
	        : (!transform.any_op)
	          -> (!transform.any_op, !transform.any_op, !transform.any_op)
	    transform.yield
	  }
	}

	// CHECK-LABEL: func.func @tile_elementwise_memref(
	// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<128x256xf32>
	// CHECK-SAME: %[[B:[a-zA-Z0-9]+]]: memref<128x256xf32>
	// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
	// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
	// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
	// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
	// CHECK-DAG: %[[C256:.+]] = arith.constant 256 : index
	// CHECK: scf.for %[[IV0:.+]] = %[[C0]] to %[[C128]] step %[[C32]]
	// CHECK-NOT: iter_args
	// CHECK: scf.for %[[IV1:.+]] = %[[C0]] to %[[C256]] step %[[C64]]
	// CHECK-DAG: %[[AT:.+]] = memref.subview %[[A]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK-DAG: %[[BT:.+]] = memref.subview %[[B]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK: linalg.elementwise kind=#linalg.elementwise_kind<negf>
	// CHECK-SAME: ins(%[[AT]] :
	// CHECK-SAME: outs(%[[BT]] :

	// -----

	// Parallel tiling with scf.forall: produces parallel_insert_slice.

	// Element-wise add of %A and %B, written into %C; all operands are
	// statically shaped 128x256 f32 tensors.
	func.func @tile_elementwise_forall(
	    %A: tensor<128x256xf32>,
	    %B: tensor<128x256xf32>,
	    %C: tensor<128x256xf32>) -> tensor<128x256xf32> {
	  %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
	           ins(%A, %B : tensor<128x256xf32>, tensor<128x256xf32>)
	           outs(%C : tensor<128x256xf32>) -> tensor<128x256xf32>
	  return %sum : tensor<128x256xf32>
	}

	// Transform script: match the linalg.elementwise op under %root and tile it
	// with tile_using_forall using tile sizes [32, 64].
	module attributes {transform.with_named_sequence} {
	  transform.named_sequence @__transform_main(
	      %root : !transform.any_op {transform.readonly}) {
	    %target = transform.structured.match ops{["linalg.elementwise"]} in %root
	        : (!transform.any_op) -> !transform.any_op
	    %tiled_op, %forall_op =
	        transform.structured.tile_using_forall %target tile_sizes [32, 64]
	        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
	    transform.yield
	  }
	}

	// CHECK-DAG: #[[$MAPX:.+]] = affine_map<(d0) -> (d0 * 32)>
	// CHECK-DAG: #[[$MAPY:.+]] = affine_map<(d0) -> (d0 * 64)>
	// CHECK-LABEL: func.func @tile_elementwise_forall(
	// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK-SAME: %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK-SAME: %[[C:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK: %[[RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (4, 4)
	// CHECK-SAME: shared_outs(%[[INIT:.+]] = %[[C]])
	// CHECK-DAG: %[[OFF0:.+]] = affine.apply #[[$MAPX]](%[[IV0]])
	// CHECK-DAG: %[[OFF1:.+]] = affine.apply #[[$MAPY]](%[[IV1]])
	// CHECK-DAG: %[[AT:.+]] = tensor.extract_slice %[[A]][%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
	// CHECK-DAG: %[[BT:.+]] = tensor.extract_slice %[[B]][%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
	// CHECK-DAG: %[[CT:.+]] = tensor.extract_slice %[[INIT]][%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
	// CHECK: %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
	// CHECK-SAME: ins(%[[AT]], %[[BT]] :
	// CHECK-SAME: outs(%[[CT]] :
	// CHECK: scf.forall.in_parallel
	// CHECK: tensor.parallel_insert_slice %[[TILED]] into %[[INIT]]
	// CHECK-SAME: [%[[OFF0]], %[[OFF1]]] [32, 64] [1, 1]
	// CHECK: return %[[RESULT]]

	// -----

	// Broadcast: non-identity indexing map. The input has rank 1 (indexed by d1
	// only), so it is sliced along d1 alone; tiling along d0 leaves it untouched.

	#map_in = affine_map<(d0, d1) -> (d1)>
	#map_out = affine_map<(d0, d1) -> (d0, d1)>

	// Element-wise exp with explicit indexing maps: the rank-1 input %A uses
	// #map_in and the rank-2 output %B uses #map_out.
	func.func @tile_elementwise_broadcast(
	    %A: tensor<256xf32>,
	    %B: tensor<128x256xf32>) -> tensor<128x256xf32> {
	  %bcast_exp = linalg.elementwise kind=#linalg.elementwise_kind<exp>
	                 indexing_maps = [#map_in, #map_out]
	                 ins(%A : tensor<256xf32>)
	                 outs(%B : tensor<128x256xf32>) -> tensor<128x256xf32>
	  return %bcast_exp : tensor<128x256xf32>
	}

	// Transform script: match the linalg.elementwise op under %root and tile it
	// with tile_using_for using tile sizes [32, 64].
	module attributes {transform.with_named_sequence} {
	  transform.named_sequence @__transform_main(
	      %root : !transform.any_op {transform.readonly}) {
	    %target = transform.structured.match ops{["linalg.elementwise"]} in %root
	        : (!transform.any_op) -> !transform.any_op
	    %tiled_op, %for0, %for1 =
	        transform.structured.tile_using_for %target tile_sizes [32, 64]
	        : (!transform.any_op)
	          -> (!transform.any_op, !transform.any_op, !transform.any_op)
	    transform.yield
	  }
	}

	// CHECK-LABEL: func.func @tile_elementwise_broadcast(
	// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: tensor<256xf32>
	// CHECK-SAME: %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] =
	// CHECK-SAME: iter_args(%[[INIT0:[a-zA-Z0-9]+]] = %[[B]])
	// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] =
	// CHECK-SAME: iter_args(%[[INIT1:[a-zA-Z0-9]+]] = %[[INIT0]])
	// Input is 1-D: sliced only along d1, not d0.
	// CHECK: %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV1]]] [64] [1]
	// CHECK: %[[BT:.+]] = tensor.extract_slice %[[INIT1]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK: %[[TILED:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<exp>
	// CHECK-SAME: ins(%[[AT]] : tensor<64xf32>)
	// CHECK-SAME: outs(%[[BT]] : tensor<32x64xf32>)
	// CHECK: tensor.insert_slice %[[TILED]] into %[[INIT1]]
	// CHECK-SAME: [%[[IV0]], %[[IV1]]] [32, 64] [1, 1]

	// -----

	// Tile-and-fuse: exp producer is fused into the tiled add consumer.

	// Producer/consumer chain: exp(%A) is computed into a fresh tensor.empty,
	// then added to %B into a second tensor.empty; the add result is returned.
	func.func @tile_and_fuse_elementwise(
	    %A: tensor<128x256xf32>,
	    %B: tensor<128x256xf32>) -> tensor<128x256xf32> {
	  // Producer: exp of %A.
	  %exp_init = tensor.empty() : tensor<128x256xf32>
	  %exp_val = linalg.elementwise kind=#linalg.elementwise_kind<exp>
	               ins(%A : tensor<128x256xf32>)
	               outs(%exp_init : tensor<128x256xf32>) -> tensor<128x256xf32>
	  // Consumer: add of the exp result and %B.
	  %add_init = tensor.empty() : tensor<128x256xf32>
	  %sum = linalg.elementwise kind=#linalg.elementwise_kind<add>
	           ins(%exp_val, %B : tensor<128x256xf32>, tensor<128x256xf32>)
	           outs(%add_init : tensor<128x256xf32>) -> tensor<128x256xf32>
	  return %sum : tensor<128x256xf32>
	}

	// Transform script: match the linalg.elementwise op whose kind is add and
	// apply transform.structured.fuse to it with tile sizes [32, 64].
	module attributes {transform.with_named_sequence} {
	  transform.named_sequence @__transform_main(
	      %root : !transform.any_op {transform.readonly}) {
	    %consumer = transform.structured.match ops{["linalg.elementwise"]}
	        attributes{kind = #linalg.elementwise_kind<add>} in %root
	        : (!transform.any_op) -> !transform.any_op
	    %fused_op, %for0, %for1 =
	        transform.structured.fuse %consumer tile_sizes [32, 64]
	        : (!transform.any_op)
	          -> (!transform.any_op, !transform.any_op, !transform.any_op)
	    transform.yield
	  }
	}

	// CHECK-LABEL: func.func @tile_and_fuse_elementwise(
	// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK-SAME: %[[B:[a-zA-Z0-9]+]]: tensor<128x256xf32>
	// CHECK: %[[EMPTY:.+]] = tensor.empty()
	// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] =
	// CHECK-SAME: iter_args(%[[INIT0:[a-zA-Z0-9]+]] = %[[EMPTY]])
	// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] =
	// CHECK-SAME: iter_args(%[[INIT1:[a-zA-Z0-9]+]] = %[[INIT0]])
	// exp tile is generated from the original input, not the full exp result.
	// CHECK-DAG: %[[AT:.+]] = tensor.extract_slice %[[A]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK: %[[EXP_TILE:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<exp>
	// CHECK-SAME: ins(%[[AT]] :
	// CHECK-DAG: %[[BT:.+]] = tensor.extract_slice %[[B]][%[[IV0]], %[[IV1]]] [32, 64] [1, 1]
	// CHECK: %[[ADD_TILE:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
	// CHECK-SAME: ins(%[[EXP_TILE]], %[[BT]] :
	// CHECK: tensor.insert_slice %[[ADD_TILE]] into %[[INIT1]]
	// CHECK-SAME: [%[[IV0]], %[[IV1]]] [32, 64] [1, 1]