// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -canonicalize -buffer-loop-hoisting -drop-equivalent-buffer-results -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -drop-equivalent-buffer-results -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
// TODO: Some test cases from this file should be moved to other dialects.
// CHECK-LABEL: func @fill_inplace(
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-NO-LAYOUT-MAP-LABEL: func @fill_inplace(%{{.*}}: memref<?xf32>) {
func.func @fill_inplace(
%A : tensor<?xf32> {bufferization.writable = true})
-> tensor<?xf32>
{
// CHECK: %[[F0:.*]] = arith.constant 0.000000e+00 : f32
%f0 = arith.constant 0.0 : f32
/// Inplaceable (%A is writable), so no alloc is needed.
// CHECK-NOT: alloc
// CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[A]] : memref<?xf32, strided<[?], offset: ?>>)
%r = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r: tensor<?xf32>
}
// -----
/// bufferization.writable is false: the argument buffer must not be written to, so an allocation is needed.
// CHECK-LABEL: func @not_inplace(
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32> {
// CHECK-NO-LAYOUT-MAP-LABEL: func @not_inplace(%{{.*}}: memref<?xf32>) -> memref<?xf32>
func.func @not_inplace(
%A : tensor<?xf32> {bufferization.writable = false})
-> tensor<?xf32>
{
// CHECK: %[[F0:.*]] = arith.constant 0.000000e+00 : f32
%f0 = arith.constant 0.0 : f32
// CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, strided<[?], offset: ?>>
// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 64 : i64} : memref<?xf32>
// CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[ALLOC]] : memref<?xf32>)
%r = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>
// CHECK-NOT: dealloc
// CHECK: return %[[ALLOC]] : memref<?xf32>
return %r: tensor<?xf32>
}
// -----
// CHECK-LABEL: func @not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?x?xf32, strided<[?, ?], offset: ?>>) {
// CHECK-NO-LAYOUT-MAP-LABEL: func @not_inplace(%{{.*}}: memref<?x?xf32>) {
func.func @not_inplace(
%A : tensor<?x?xf32> {bufferization.writable = true})
-> tensor<?x?xf32>
{
%f0 = arith.constant 0.0 : f32
/// Cross-op multiple uses of %A: the first op, which has interfering reads, must allocate.
// CHECK: %[[ALLOC:.*]] = memref.alloc
// CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[ALLOC]]
%f = linalg.fill ins(%f0 : f32) outs(%A : tensor<?x?xf32>) -> tensor<?x?xf32>
/// The second op has no interfering reads and can reuse the buffer of %A.
// CHECK-NOT: alloc
// CHECK: linalg.matmul ins(%[[ALLOC]], %[[ALLOC]]{{.*}}) outs(%[[A]]
%r = linalg.matmul ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
outs(%A: tensor<?x?xf32>)
-> tensor<?x?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r: tensor<?x?xf32>
}
// -----
// CHECK-LABEL: func @not_inplace
func.func @not_inplace(
%A : tensor<?x?xf32> {bufferization.writable = true}) -> tensor<?x?xf32> {
/// Multiple uses of %A within a single op: must allocate.
// CHECK: alloc
%r = linalg.matmul ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
outs(%A: tensor<?x?xf32>)
-> tensor<?x?xf32>
// CHECK-NOT: dealloc
return %r: tensor<?x?xf32>
}
// -----
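/// %A is writable, so the vector.transfer_write bufferizes in place: no alloc, no copy.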
// CHECK-LABEL: func @vec_inplace
func.func @vec_inplace(
%A : tensor<?xf32> {bufferization.writable = true}, %vec : vector<4xf32>)
-> tensor<?xf32>
{
%c0 = arith.constant 0 : index
// CHECK-NOT: alloc
%r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r: tensor<?xf32>
}
// -----
// CHECK-LABEL: func @vec_not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
func.func @vec_not_inplace(
%A : tensor<?xf32> {bufferization.writable = true}, %vec : vector<4xf32>)
-> (tensor<?xf32>, tensor<?xf32>)
{
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
/// Cross-op multiple uses of %A: the first vector.transfer_write, which has interfering reads, must allocate.
// CHECK: %[[ALLOC:.*]] = memref.alloc
// CHECK: memref.copy {{.*}}, %[[ALLOC]]
// CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
%r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
/// The second vector.transfer has no interfering reads and can reuse the buffer.
// CHECK-NOT: alloc
// CHECK-NEXT: vector.transfer_write {{.*}}, %[[A]]
%r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r0, %r1: tensor<?xf32>, tensor<?xf32>
}
// -----
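/// Tiled matmul: the tile buffer is allocated once (the alloc is hoisted out of the loops)
/// and the result tile is copied back into %C by the bufferized insert_slice.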
// CHECK: func @matmul(
// CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref<128x256xf32>
// CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref<256x192xf32>
// CHECK-SAME: %[[C:[0-9a-zA-Z]*]]: memref<128x192xf32>
func.func @matmul(
%A: tensor<128x256xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
%B: tensor<256x192xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
%C: tensor<128x192xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = true})
-> tensor<128x192xf32> {
%c0 = arith.constant 0 : index
%c256 = arith.constant 256 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c128 = arith.constant 128 : index
%c192 = arith.constant 192 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
// Hoisted alloc.
// CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x16xf32>
// CHECK: scf.for %[[I:.*]] =
%0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
%1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] :
tensor<128x256xf32> to tensor<8x256xf32>
// CHECK: scf.for %[[J:.*]] =
%2 = scf.for %arg5 = %c0 to %c192 step %c16 iter_args(%arg6 = %arg4) -> (tensor<128x192xf32>) {
%3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
tensor<256x192xf32> to tensor<256x16xf32>
// Force an artificial out-of-place bufferization by extracting from %C
// instead of %arg6.
%4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
tensor<128x192xf32> to tensor<8x16xf32>
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]]
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>
// CHECK: scf.for %[[K:.*]] =
%6 = scf.for %arg7 = %c0 to %c256 step %c32 iter_args(%arg8 = %5) -> (tensor<8x16xf32>) {
%8 = tensor.extract_slice %1[0, %arg7] [8, 32] [1, 1] :
tensor<8x256xf32> to tensor<8x32xf32>
%9 = tensor.extract_slice %3[%arg7, 0] [32, 16] [1, 1] :
tensor<256x16xf32> to tensor<32x16xf32>
// linalg.matmul bufferizes in place, as does the enclosing scf.for.
// CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
%10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
outs(%arg8 : tensor<8x16xf32>)
-> tensor<8x16xf32>
scf.yield %10 : tensor<8x16xf32>
}
// insert_slice bufferizes in place, but its source comes from an equivalent
// buffer that was bufferized out of place. So a copy from the small buffer
// into the bigger buffer must be inserted.
// CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
// CHECK: memref.copy %[[ALLOC]], %[[T]]
%7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
tensor<8x16xf32> into tensor<128x192xf32>
scf.yield %7 : tensor<128x192xf32>
}
scf.yield %2 : tensor<128x192xf32>
}
return %0 : tensor<128x192xf32>
}
// -----
/// This test just checks that the produced IR is valid and has no dominance
/// errors in its def-use chains.
// CHECK-LABEL: func @dominance_violation_bug_1
func.func @dominance_violation_bug_1(
%A : tensor<?x?xf32> {bufferization.writable = false},
%idx : index)
-> tensor<?x?xf32>
{
%f0 = arith.constant 0.0 : f32
%sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
%FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>) -> tensor<4x4xf32>
%rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
%rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
return %rA : tensor<?x?xf32>
}
// -----
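/// Gather-like generic: %arg1 and %arg2 bufferize directly to the argument buffers,
/// and the tensor.extract from the read-only %arg0 lowers to a memref.load.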
func.func @gather_like(
%arg0 : tensor<?x?xf32> {bufferization.writable = false},
%arg1 : tensor<?xi32> {bufferization.writable = false},
%arg2 : tensor<?x?xf32> {bufferization.writable = true})
-> tensor<?x?xf32>
{
%0 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0)>,
affine_map<(d0, d1) -> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
ins(%arg1 : tensor<?xi32>) outs(%arg2 : tensor<?x?xf32>) {
^bb0(%arg3: i32, %arg4 : f32):
%iv1 = linalg.index 1 : index
%1 = arith.index_cast %arg3: i32 to index
%2 = tensor.extract %arg0[%1, %iv1] : tensor<?x?xf32>
linalg.yield %2 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// CHECK-LABEL: func @gather_like(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32,
// CHECK-SAME: %[[ARG1:.+]]: memref<?xi32
// CHECK-SAME: %[[ARG2:.+]]: memref<?x?xf32
// CHECK-SAME: ) {
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG1]] :
// CHECK-SAME: outs(%[[ARG2]] :
// CHECK: %[[YIELD:.+]] = memref.load %[[ARG0]]
// CHECK: linalg.yield %[[YIELD]]
// -----
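/// All arguments are writable, so the generic op bufferizes in place: %t1 and %t2 are
/// read from their argument buffers and %t3 is used directly as the output buffer.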
// CHECK-LABEL: func @linalg_op_bufferizes_inplace_with_input
// CHECK-SAME: %[[t1:.*]]: memref<?x?xf32, strided{{.*}}>, %[[t2:.*]]: memref<?xf32, strided{{.*}}>, %[[t3:.*]]: memref<?x?xf32, strided{{.*}}>
func.func @linalg_op_bufferizes_inplace_with_input(
%t1: tensor<?x?xf32> {bufferization.writable = true},
%t2: tensor<?xf32> {bufferization.writable = true},
%t3: tensor<?x?xf32> {bufferization.writable = true},
%s1: index, %s2: index, %cst: f32)
-> tensor<?x?xf32>
{
// CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t3]] : {{.*}})
%r = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>,
affine_map<(d0, d1)-> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
ins(%t1, %t2 : tensor<?x?xf32>, tensor<?xf32>)
outs(%t3 : tensor<?x?xf32>) {
^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) :
%add = arith.addf %arg0, %arg1 : f32
linalg.yield %add : f32
} -> tensor<?x?xf32>
return %r : tensor<?x?xf32>
}
// -----
#accesses = [
affine_map<(i) -> (i)>
]
#trait = {
indexing_maps = #accesses,
iterator_types = ["parallel"]
}
// CHECK-LABEL: func @op_is_reading_but_following_ops_are_not
// CHECK-SAME: %[[t0:.*]]: memref<?xf32
func.func @op_is_reading_but_following_ops_are_not(
%t0 : tensor<?xf32> {bufferization.writable = false},
%cst : f32)
-> tensor<?xf32>
{
// Make sure that a copy is inserted here.
// CHECK: %[[ALLOC:.*]] = memref.alloc
// CHECK: memref.copy %[[t0]], %[[ALLOC]]
// CHECK: linalg.generic {{.*}} outs(%[[ALLOC]] : memref
%r0 = linalg.generic #trait outs(%t0 : tensor<?xf32>) {
^bb(%0: f32) :
%a = arith.addf %cst, %0 : f32
linalg.yield %a : f32
} -> (tensor<?xf32>)
// CHECK: linalg.generic {{.*}} outs(%[[ALLOC]] : memref
%r1 = linalg.generic #trait outs(%r0 : tensor<?xf32>) {
^bb(%0: f32) :
linalg.yield %cst : f32
} -> (tensor<?xf32>)
// CHECK: return %[[ALLOC]]
return %r1 : tensor<?xf32>
}
// -----
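/// linalg.map: the inputs are read directly from the argument buffers.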
// CHECK-LABEL: func @map_binary
// CHECK-SAME: %[[LHS:[0-9a-zA-Z]*]]: memref<64xf32
// CHECK-SAME: %[[RHS:[0-9a-zA-Z]*]]: memref<64xf32
func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>,
%init: tensor<64xf32>) -> tensor<64xf32> {
// CHECK: linalg.map { arith.addf } ins(%[[LHS]], %[[RHS]] : memref<64xf32
%add = linalg.map
ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>)
outs(%init:tensor<64xf32>)
(%lhs_elem: f32, %rhs_elem: f32) {
%0 = arith.addf %lhs_elem, %rhs_elem: f32
linalg.yield %0: f32
}
func.return %add : tensor<64xf32>
}
// -----
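/// linalg.reduce: the input is read directly from the argument buffer.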
// CHECK-LABEL: func @reduce
// CHECK-SAME: %[[INPUT:.*]]: memref<16x32x64xf32
func.func @reduce(%input: tensor<16x32x64xf32>,
%init: tensor<16x64xf32>) -> tensor<16x64xf32> {
// CHECK: linalg.reduce { arith.addf } ins(%[[INPUT]] : memref<16x32x64xf32
%reduce = linalg.reduce
ins(%input:tensor<16x32x64xf32>)
outs(%init:tensor<16x64xf32>)
dimensions = [1]
(%in: f32, %out: f32) {
%0 = arith.addf %out, %in: f32
linalg.yield %0: f32
}
func.return %reduce : tensor<16x64xf32>
}
// -----
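/// linalg.transpose: the input is read directly from the argument buffer.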
// CHECK-LABEL: func @transpose
// CHECK-SAME: %[[ARG0:.*]]: memref<16x32x64xf32
func.func @transpose(%input: tensor<16x32x64xf32>,
%init: tensor<32x64x16xf32>) -> tensor<32x64x16xf32> {
// CHECK: linalg.transpose ins(%[[ARG0]] : memref<16x32x64xf32
%transpose = linalg.transpose
ins(%input:tensor<16x32x64xf32>)
outs(%init:tensor<32x64x16xf32>)
permutation = [1, 2, 0]
func.return %transpose : tensor<32x64x16xf32>
}
// -----
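/// linalg.broadcast: the input is read directly from the argument buffer.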
// CHECK-LABEL: func @broadcast
// CHECK-SAME: %[[ARG0:.*]]: memref<8x32xf32
func.func @broadcast(%input: tensor<8x32xf32>,
%init: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
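// CHECK: linalg.broadcast ins(%[[ARG0]] : memref<8x32xf32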
%bcast = linalg.broadcast
ins(%input:tensor<8x32xf32>)
outs(%init:tensor<8x16x32xf32>)
dimensions = [1]
func.return %bcast : tensor<8x16x32xf32>
}
// -----
//===----------------------------------------------------------------------===//
// AllocTensorOp elimination would produce SSA violations for the example below.
//===----------------------------------------------------------------------===//
func.func @depthwise_conv_1d_nwc_wc(%arg0: index, %arg1: index, %arg2: tensor<8x18x32xf32>)
-> tensor<?x1x6x8xf32> {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%0 = bufferization.alloc_tensor() : tensor<4x1x6x8xf32>
%1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
%2 = bufferization.alloc_tensor() : tensor<1x6x8xf32>
%3 = scf.for %arg3 = %c0 to %c32 step %c8 iter_args(%arg4 = %1) -> (tensor<?x1x6x8xf32>) {
%4 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg3)
%5 = tensor.insert_slice %2 into %arg4[%4, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] :
tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
scf.yield %5 : tensor<?x1x6x8xf32>
}
return %3 : tensor<?x1x6x8xf32>
}
// -----
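/// Each returned tensor gets its own allocation, but no copy is inserted: the contents
/// of a bufferization.alloc_tensor are undefined, so there is nothing to preserve.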
// CHECK-LABEL: func @do_not_copy_alloc_tensors(
func.func @do_not_copy_alloc_tensors(%f1: f32, %f2: f32, %idx: index)
-> (tensor<5xf32>, tensor<5xf32>)
{
// CHECK: memref.alloc
// CHECK: memref.alloc
// CHECK-NOT: copy
// CHECK: memref.store
// CHECK: memref.store
%0 = bufferization.alloc_tensor() : tensor<5xf32>
%1 = tensor.insert %f1 into %0[%idx] : tensor<5xf32>
%2 = tensor.insert %f2 into %0[%idx] : tensor<5xf32>
return %1, %2 : tensor<5xf32>, tensor<5xf32>
}