// RUN: mlir-opt %s -split-input-file -linalg-fold-unit-extent-dims | FileCheck %s
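// Check that the two unit-trip loops are dropped: the operands are reshaped to
// collapse their unit dimensions and the op is rewritten over three parallel loops.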
#accesses = [
affine_map<(i, j, k, l, m) -> (i, k, m)>,
affine_map<(i, j, k, l, m) -> (i, k, j, l, m)>
]
#trait = {
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"],
indexing_maps = #accesses,
library_call = "some_external_func"
}
func @drop_one_trip_loops(%arg0 : tensor<?x1x?xf32>, %shape: tensor<?x1x?x1x?xf32>) -> tensor<?x1x?x1x?xf32> {
%0 = linalg.generic #trait
ins(%arg0 : tensor<?x1x?xf32>)
outs(%shape : tensor<?x1x?x1x?xf32>) {
^bb0(%arg2 : f32, %arg3 : f32) :
linalg.yield %arg2 : f32
} -> tensor<?x1x?x1x?xf32>
return %0 : tensor<?x1x?x1x?xf32>
}
// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
// CHECK-LABEL: func @drop_one_trip_loops
// CHECK: linalg.tensor_reshape %{{.*}} {{\[}}[0, 1], [2]]
// CHECK: linalg.generic
// CHECK-SAME: indexing_maps = [#[[$MAP2]], #[[$MAP3]]]
// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]
// CHECK: linalg.tensor_reshape %{{.*}} {{\[}}[0, 1], [2, 3], [4]]
// -----
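// Same folding for linalg.indexed_generic: the index block arguments of the
// dropped unit dimensions are removed, leaving three index arguments.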
#accesses = [
affine_map<(i, j, k, l, m) -> (i, k, m)>,
affine_map<(i, j, k, l, m) -> (i, k, j, l, m)>
]
#trait = {
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"],
indexing_maps = #accesses,
library_call = "some_external_func"
}
func @drop_one_trip_loops_indexed_generic
(%arg0 : tensor<?x1x?xi32>, %shape: tensor<?x1x?x1x?xi32>) -> tensor<?x1x?x1x?xi32>
{
%0 = linalg.indexed_generic #trait
ins(%arg0 : tensor<?x1x?xi32>)
outs(%shape: tensor<?x1x?x1x?xi32>) {
^bb0(%arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index,
%arg5 : index, %arg6 : i32, %arg7 : i32) :
%1 = addi %arg1, %arg2 : index
%2 = addi %1, %arg3 : index
%3 = addi %2, %arg4 : index
%4 = addi %3, %arg5 : index
%5 = index_cast %4 : index to i32
%6 = addi %5, %arg6 : i32
linalg.yield %6 : i32
} -> tensor<?x1x?x1x?xi32>
return %0 : tensor<?x1x?x1x?xi32>
}
// CHECK-LABEL: func @drop_one_trip_loops_indexed_generic
// CHECK: linalg.indexed_generic
// CHECK: ^{{.+}}(
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index, %[[ARG4:[a-zA-Z0-9]+]]: i32, %{{.*}}: i32)
// CHECK: %[[T3:.+]] = addi %[[ARG1]], %[[ARG2]]
// CHECK: %[[T4:.+]] = addi %[[T3]], %[[ARG3]]
// CHECK: %[[T5:.+]] = index_cast %[[T4]] : index to i32
// CHECK: %[[T6:.+]] = addi %[[T5]], %[[ARG4]] : i32
// CHECK: linalg.yield %[[T6]] : i32
// -----
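// Same folding when the payload uses linalg.index: only the three surviving
// loop indices are queried after the unit dimensions are dropped.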
#accesses = [
affine_map<(i, j, k, l, m) -> (i, k, m)>,
affine_map<(i, j, k, l, m) -> (i, k, j, l, m)>
]
#trait = {
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"],
indexing_maps = #accesses,
library_call = "some_external_func"
}
func @drop_one_trip_loops_indexed
(%arg0 : tensor<?x1x?xi32>, %shape: tensor<?x1x?x1x?xi32>) -> tensor<?x1x?x1x?xi32>
{
%0 = linalg.generic #trait
ins(%arg0 : tensor<?x1x?xi32>)
outs(%shape: tensor<?x1x?x1x?xi32>) {
^bb0(%arg6 : i32, %arg7 : i32) :
%idx0 = linalg.index 0 : index
%idx1 = linalg.index 1 : index
%idx2 = linalg.index 2 : index
%idx3 = linalg.index 3 : index
%idx4 = linalg.index 4 : index
%1 = addi %idx0, %idx1 : index
%2 = subi %1, %idx2 : index
%3 = subi %2, %idx3 : index
%4 = addi %3, %idx4 : index
%5 = index_cast %4 : index to i32
%6 = addi %5, %arg6 : i32
linalg.yield %6 : i32
} -> tensor<?x1x?x1x?xi32>
return %0 : tensor<?x1x?x1x?xi32>
}
// The subtractions disappear because the access map of the output tensor maps
// its unit dimensions 1 and 3 to the index dimensions 2 and 3, which are
// dropped and therefore always zero.
// CHECK-LABEL: func @drop_one_trip_loops_indexed
// CHECK: linalg.generic
// CHECK: ^{{.+}}(
// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: i32, %{{.*}}: i32)
// CHECK: %[[IDX0:.+]] = linalg.index 0 : index
// CHECK: %[[IDX1:.+]] = linalg.index 1 : index
// CHECK: %[[IDX2:.+]] = linalg.index 2 : index
// CHECK: %[[T3:.+]] = addi %[[IDX0]], %[[IDX1]]
// CHECK: %[[T4:.+]] = addi %[[T3]], %[[IDX2]]
// CHECK: %[[T5:.+]] = index_cast %[[T4]] : index to i32
// CHECK: %[[T6:.+]] = addi %[[T5]], %[[ARG4]] : i32
// CHECK: linalg.yield %[[T6]] : i32
// -----
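// A 1x1 operand is collapsed to a rank-0 tensor and every loop is dropped.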
#map0 = affine_map<(i, j) -> (i, j)>
#access = [#map0, #map0]
#trait = {
iterator_types = ["parallel", "parallel"],
indexing_maps = #access,
library_call = "some_external_func"
}
func @drop_all_loops(%arg0 : tensor<1x1xf32>) -> tensor<1x1xf32>
{
%0 = linalg.generic #trait
ins(%arg0 : tensor<1x1xf32>)
outs(%arg0 : tensor<1x1xf32>) {
^bb0(%arg1: f32, %arg2: f32) :
linalg.yield %arg1 : f32
} -> tensor<1x1xf32>
return %0 : tensor<1x1xf32>
}
// CHECK: #[[$MAP0:.*]] = affine_map<() -> ()>
// CHECK-LABEL: func @drop_all_loops
// CHECK: linalg.tensor_reshape %{{.*}} []
// CHECK: linalg.generic
// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP0]]]
// CHECK-SAME: iterator_types = []
// -----
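// indexed_generic variant of the above: the index block arguments disappear
// along with the loops, and the payload reduces to yielding the input element.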
#map0 = affine_map<(i, j) -> (i, j)>
#access = [#map0, #map0]
#trait = {
iterator_types = ["parallel", "parallel"],
indexing_maps = #access,
library_call = "some_external_func"
}
func @drop_all_loops_indexed_generic
(%arg0 : tensor<1x1xi32>) -> tensor<1x1xi32>{
%0 = linalg.indexed_generic #trait
ins(%arg0 : tensor<1x1xi32>)
outs(%arg0 : tensor<1x1xi32>) {
^bb0(%arg1 : index, %arg2 : index, %arg3: i32, %arg4: i32) :
%1 = addi %arg1, %arg2 : index
%2 = index_cast %1 : index to i32
%3 = addi %2, %arg3 : i32
linalg.yield %3 : i32
} -> tensor<1x1xi32>
return %0 : tensor<1x1xi32>
}
// CHECK-LABEL: func @drop_all_loops_indexed_generic
// CHECK: linalg.indexed_generic
// CHECK: ^{{.+}}(%[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32)
// CHECK: linalg.yield %[[ARG1]] : i32
// -----
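// linalg.index variant of the above: the index arithmetic folds away entirely
// and the payload just yields the input element.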
#map0 = affine_map<(i, j) -> (i, j)>
#access = [#map0, #map0]
#trait = {
iterator_types = ["parallel", "parallel"],
indexing_maps = #access,
library_call = "some_external_func"
}
func @drop_all_loops_indexed
(%arg0 : tensor<1x1xi32>) -> tensor<1x1xi32>{
%0 = linalg.generic #trait
ins(%arg0 : tensor<1x1xi32>)
outs(%arg0 : tensor<1x1xi32>) {
^bb0(%arg3: i32, %arg4: i32) :
%idx0 = linalg.index 0 : index
%idx1 = linalg.index 1 : index
%1 = addi %idx0, %idx1 : index
%2 = index_cast %1 : index to i32
%3 = addi %2, %arg3 : i32
linalg.yield %3 : i32
} -> tensor<1x1xi32>
return %0 : tensor<1x1xi32>
}
// CHECK-LABEL: func @drop_all_loops_indexed
// CHECK: linalg.generic
// CHECK: ^{{.+}}(%[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32)
// CHECK: linalg.yield %[[ARG1]] : i32
// -----
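// A leading unit dimension accessed through a constant-0 map is folded away:
// the 1x5 input is collapsed to a 1-D tensor and a single parallel loop remains.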
#accesses = [
affine_map<(d0) -> (0, d0)>,
affine_map<(d0) -> (d0)>
]
#trait = {
indexing_maps = #accesses,
iterator_types = ["parallel"],
library_call = "some_external_fn"
}
func @leading_dim_1_canonicalization(%arg0: tensor<1x5xf32>, %shape: tensor<5xf32>) -> tensor<5xf32> {
%0 = linalg.generic #trait
ins(%arg0 : tensor<1x5xf32>)
outs(%shape : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
linalg.yield %arg2 : f32
} -> tensor<5xf32>
return %0 : tensor<5xf32>
}
// CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (d0)>
// CHECK-LABEL: func @leading_dim_1_canonicalization
// CHECK: linalg.tensor_reshape %{{.*}} {{\[}}[0, 1]]
// CHECK: linalg.generic
// CHECK-SAME: indexing_maps = [#[[$MAP1]], #[[$MAP1]]]
// CHECK-SAME: iterator_types = ["parallel"]
// -----
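// tensor_reshapes that only introduce broadcasting unit dimensions are folded
// into the indexing maps of the consuming generic op; no reshape remains.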
#accesses = [
affine_map<(d0, d1) -> (0, d1)>,
affine_map<(d0, d1) -> (d0, 0)>,
affine_map<(d0, d1) -> (d0, d1)>
]
#trait = {
indexing_maps = #accesses,
iterator_types = ["parallel", "parallel"],
library_call = "some_external_fn"
}
func @broadcast_test(%arg0 : tensor<5xf32>, %arg1 : tensor<5xf32>, %shape : tensor<5x5xf32>) -> tensor<5x5xf32>
{
%0 = linalg.tensor_reshape %arg0 [[0, 1]] : tensor<5xf32> into tensor<1x5xf32>
%1 = linalg.tensor_reshape %arg1 [[0, 1]] : tensor<5xf32> into tensor<5x1xf32>
%2 = linalg.generic #trait
ins(%0, %1 : tensor<1x5xf32>, tensor<5x1xf32>)
outs(%shape : tensor<5x5xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%3 = addf %arg3, %arg4 : f32
linalg.yield %3 : f32
} -> tensor<5x5xf32>
return %2 : tensor<5x5xf32>
}
// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1)>
// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0)>
// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-LABEL: func @broadcast_test
// CHECK-NOT: linalg.tensor_reshape
// CHECK: linalg.generic
// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
// CHECK-SAME: iterator_types = ["parallel", "parallel"]
// CHECK-NOT: linalg.tensor_reshape
// -----
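// A 1x1 input read through a constant map is collapsed to a rank-0 tensor and
// broadcast through an empty indexing map.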
#accesses = [
affine_map<(d0, d1) -> (0, 0)>,
affine_map<(d0, d1) -> (d0, d1)>
]
#trait = {
indexing_maps = #accesses,
iterator_types = ["parallel", "parallel"],
library_call = "some_external_fn"
}
func @broadcast_scalar(%arg0 : tensor<1x1xf32>, %shape : tensor<?x?xf32>) -> tensor<?x?xf32>
{
%0 = linalg.generic #trait
ins(%arg0 : tensor<1x1xf32>)
outs(%shape : tensor<?x?xf32>) {
^bb0(%arg2 : f32, %arg3 : f32):
linalg.yield %arg2 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> ()>
// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-LABEL: func @broadcast_scalar
// CHECK-SAME: %[[ARG0:.*]]: tensor<1x1xf32>
// CHECK: %[[A:.*]] = linalg.tensor_reshape %[[ARG0]] []
// CHECK-SAME: tensor<1x1xf32> into tensor<f32>
// CHECK: linalg.generic
// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP1]]]
// CHECK-SAME: iterator_types = ["parallel", "parallel"]
// CHECK-SAME: %[[A]]
// -----
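// A tensor_reshape that only drops a unit dimension of a generic op result is
// folded into the op; only the generic remains.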
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2)>
func @fold_unit_dim_tensor_reshape_op(%arg0 : tensor<5xf32>) -> tensor<2x5xf32>
{
%1 = linalg.init_tensor [1, 2, 5] : tensor<1x2x5xf32>
%2 = linalg.generic {indexing_maps = [#map1, #map0],
iterator_types = ["parallel", "parallel", "parallel"]}
ins(%arg0 : tensor<5xf32>) outs(%1 : tensor<1x2x5xf32>) {
^bb0(%arg1: f32, %arg2: f32): // no predecessors
linalg.yield %arg1 : f32
} -> tensor<1x2x5xf32>
%3 = linalg.tensor_reshape %2 [[0, 1], [2]]
: tensor<1x2x5xf32> into tensor<2x5xf32>
return %3 : tensor<2x5xf32>
}
// CHECK-LABEL: func @fold_unit_dim_tensor_reshape_op
// CHECK: %[[RESULT:.+]] = linalg.generic
// CHECK: return %[[RESULT]]
// -----
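// Unit dimensions are dropped through init_tensor and fill as well: the
// reduction is rewritten over a 1-D input into a rank-0 accumulator, and a
// reshape re-expands the rank-0 result to tensor<1xf32>.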
func @fold_unit_dim_for_init_tensor(%input: tensor<1x1000xf32>) -> tensor<1xf32> {
%cst = constant 0.0 : f32
%init = linalg.init_tensor [1] : tensor<1xf32>
%fill = linalg.fill(%init, %cst) : tensor<1xf32>, f32 -> tensor<1xf32>
%add = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"]}
ins(%input : tensor<1x1000xf32>) outs(%fill : tensor<1xf32>) {
^bb0(%arg1: f32, %arg2: f32):
%1823 = addf %arg1, %arg2 : f32
linalg.yield %1823 : f32
} -> tensor<1xf32>
return %add : tensor<1xf32>
}
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ()>
// CHECK: func @fold_unit_dim_for_init_tensor
// CHECK: %[[INPUT_RESHAPE:.+]] = linalg.tensor_reshape %{{.+}} {{\[}}[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32>
// CHECK: %[[INIT:.+]] = linalg.init_tensor [] : tensor<f32>
// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %cst) : tensor<f32>, f32 -> tensor<f32>
// CHECK: %[[GENERIC:.+]] = linalg.generic
// CHECK-SAME: indexing_maps = [#[[MAP1]], #[[MAP2]]]
// CHECK-SAME: iterator_types = ["reduction"]
// CHECK-SAME: ins(%[[INPUT_RESHAPE]] : tensor<1000xf32>)
// CHECK-SAME: outs(%[[FILL]] : tensor<f32>)
// CHECK: %[[GENERIC_RESHAPE:.+]] = linalg.tensor_reshape %[[GENERIC]] [] : tensor<f32> into tensor<1xf32>
// CHECK: return %[[GENERIC_RESHAPE]] : tensor<1xf32>
// -----
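// Unit dimensions in subtensor results are rank-reduced: the slices are taken
// as rank-3 tensors and a tensor_reshape re-expands them to the 7-D result type.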
func @fold_subtensor(
%arg0 : tensor<1x?x?x1x?x1x1xf32>, %arg1 : tensor<1x?x?x?x?x1x1xf32>,
%arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index,
%arg6 : index, %arg7 : index) -> (tensor<1x?x?x1x?x1x1xf32>, tensor<1x?x?x1x?x1x1xf32>) {
%0 = subtensor %arg0[0, %arg2, %arg3, 0, %arg4, 0, 0]
[1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] :
tensor<1x?x?x1x?x1x1xf32> to tensor<1x?x?x1x?x1x1xf32>
%1 = subtensor %arg1[%arg2, 0, %arg3, 0, 0, %arg4, 0]
[1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] :
tensor<1x?x?x?x?x1x1xf32> to tensor<1x?x?x1x?x1x1xf32>
return %0, %1 : tensor<1x?x?x1x?x1x1xf32>, tensor<1x?x?x1x?x1x1xf32>
}
// CHECK: func @fold_subtensor
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x?x?x1x?x1x1xf32>
// CHECK-SAME: %[[ARG1:.+]]: tensor<1x?x?x?x?x1x1xf32>
// CHECK: %[[SUBTENSOR1:.+]] = subtensor %[[ARG0]]
// CHECK-SAME: to tensor<?x?x?xf32>
// CHECK: %[[RESULT1:.+]] = linalg.tensor_reshape %[[SUBTENSOR1]]
// CHECK-SAME: [0, 1], [2], [3, 4, 5, 6]
// CHECK: %[[SUBTENSOR2:.+]] = subtensor %[[ARG1]]
// CHECK-SAME: to tensor<?x?x?xf32>
// CHECK: %[[RESULT2:.+]] = linalg.tensor_reshape %[[SUBTENSOR2]]
// CHECK-SAME: [0, 1], [2], [3, 4, 5, 6]
// CHECK: return %[[RESULT1]], %[[RESULT2]]
// -----
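// Unit dimensions of a reduction are dropped, rewriting the 4-D reduction as a
// 2-D one over reshaped operands, with a reshape restoring the 1x? result.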
func @unit_dim_for_reduction(%arg0: tensor<1x?x1x?xf32>) -> tensor<1x?xf32> {
%cst = constant 1.000000e+00 : f32
%c3 = constant 3 : index
%0 = memref.dim %arg0, %c3 : tensor<1x?x1x?xf32>
%1 = linalg.init_tensor [1, %0] : tensor<1x?xf32>
%2 = linalg.fill(%1, %cst) : tensor<1x?xf32>, f32 -> tensor<1x?xf32>
%3 = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
ins(%arg0 : tensor<1x?x1x?xf32>)
outs(%2 : tensor<1x?xf32>) {
^bb0(%arg1: f32, %arg2: f32): // no predecessors
%4 = addf %arg1, %arg2 : f32
linalg.yield %4 : f32
} -> tensor<1x?xf32>
return %3 : tensor<1x?xf32>
}
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)>
// CHECK: func @unit_dim_for_reduction
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x?x1x?xf32>
// CHECK-DAG: %[[RESHAPE:.+]] = linalg.tensor_reshape %[[ARG0]] {{\[}}[0, 1, 2], [3]]
// CHECK: %[[INIT:.+]] = linalg.init_tensor [%{{.+}}] : tensor<?xf32>
// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
// CHECK: %[[RESULT:.+]] = linalg.generic
// CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]
// CHECK-SAME: iterator_types = ["parallel", "reduction"]
// CHECK-SAME: ins(%[[RESHAPE]] : tensor<?x?xf32>)
// CHECK-SAME: outs(%[[FILL]] : tensor<?xf32>)
// CHECK: %[[RESULT_RESHAPE:.+]] = linalg.tensor_reshape %[[RESULT]] {{\[}}[0, 1]]
// CHECK: return %[[RESULT_RESHAPE]]
// -----
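// As above, but every dimension of the output is a unit extent: one unit
// dimension is kept and the reduction still produces a tensor<1xf32>.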
func @unit_dim_for_reduction_keep_one(%arg0: tensor<1x?x1x1xf32>) -> tensor<1x1xf32> {
%cst = constant 1.000000e+00 : f32
%c3 = constant 3 : index
%1 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%2 = linalg.fill(%1, %cst) : tensor<1x1xf32>, f32 -> tensor<1x1xf32>
%3 = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
ins(%arg0 : tensor<1x?x1x1xf32>)
outs(%2 : tensor<1x1xf32>) {
^bb0(%arg1: f32, %arg2: f32): // no predecessors
%4 = addf %arg1, %arg2 : f32
linalg.yield %4 : f32
} -> tensor<1x1xf32>
return %3 : tensor<1x1xf32>
}
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)>
// CHECK: func @unit_dim_for_reduction_keep_one
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x?x1x1xf32>
// CHECK-DAG: %[[RESHAPE:.+]] = linalg.tensor_reshape %[[ARG0]] {{\[}}[0, 1, 2], [3]]
// CHECK: %[[INIT:.+]] = linalg.init_tensor [1] : tensor<1xf32>
// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
// CHECK: %[[RESULT:.+]] = linalg.generic
// CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]
// CHECK-SAME: iterator_types = ["parallel", "reduction"]
// CHECK-SAME: ins(%[[RESHAPE]] : tensor<?x1xf32>)
// CHECK-SAME: outs(%[[FILL]] : tensor<1xf32>)
// CHECK: %[[RESULT_RESHAPE:.+]] = linalg.tensor_reshape %[[RESULT]] {{\[}}[0, 1]]
// CHECK: return %[[RESULT_RESHAPE]]
// -----
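// Inner unit dimensions are dropped as well: the ?x1x?x1 input collapses to
// ?x? and the reduction result to a 1-D tensor.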
func @unit_dim_for_reduction_inner(%arg0: tensor<?x1x?x1xf32>) -> tensor<?x1xf32> {
%cst = constant 1.000000e+00 : f32
%c2 = constant 2 : index
%0 = memref.dim %arg0, %c2 : tensor<?x1x?x1xf32>
%1 = linalg.init_tensor [%0, 1] : tensor<?x1xf32>
%2 = linalg.fill(%1, %cst) : tensor<?x1xf32>, f32 -> tensor<?x1xf32>
%3 = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
ins(%arg0 : tensor<?x1x?x1xf32>)
outs(%2 : tensor<?x1xf32>) {
^bb0(%arg1: f32, %arg2: f32): // no predecessors
%4 = addf %arg1, %arg2 : f32
linalg.yield %4 : f32
} -> tensor<?x1xf32>
return %3 : tensor<?x1xf32>
}
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)>
// CHECK: func @unit_dim_for_reduction_inner
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x1x?x1xf32>
// CHECK-DAG: %[[RESHAPE:.+]] = linalg.tensor_reshape %[[ARG0]] {{\[}}[0, 1], [2, 3]]
// CHECK: %[[INIT:.+]] = linalg.init_tensor [%{{.+}}] : tensor<?xf32>
// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
// CHECK: %[[RESULT:.+]] = linalg.generic
// CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]
// CHECK-SAME: iterator_types = ["parallel", "reduction"]
// CHECK-SAME: ins(%[[RESHAPE]] : tensor<?x?xf32>)
// CHECK-SAME: outs(%[[FILL]] : tensor<?xf32>)
// CHECK: %[[RESULT_RESHAPE:.+]] = linalg.tensor_reshape %[[RESULT]] {{\[}}[0, 1]]
// CHECK: return %[[RESULT_RESHAPE]]
// -----
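// A subtensor whose result has only unit dimensions is rank-reduced to a
// rank-0 subtensor followed by a reshape back to tensor<1x1xf32>.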
func @subtensor_unit_dims(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> {
%0 = subtensor %arg0[0, 2] [1, 1] [1, 1] : tensor<1x3xf32> to tensor<1x1xf32>
return %0 : tensor<1x1xf32>
}
// CHECK-LABEL: func @subtensor_unit_dims
// CHECK: %[[SUBTENSOR:.+]] = subtensor
// CHECK-SAME: tensor<1x3xf32> to tensor<f32>
// CHECK: %[[RESULT:.+]] = linalg.tensor_reshape %[[SUBTENSOR]] []
// CHECK: return %[[RESULT]]
// -----
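// Conversely, the 1x1 source of a subtensor_insert is collapsed to a rank-0
// tensor before being inserted into the destination.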
func @subtensor_insert_unit_dims(%arg0: tensor<1x3xf32>, %arg1: tensor<1x1xf32>) -> tensor<1x3xf32> {
%0 = subtensor_insert %arg1 into %arg0[0, 2] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x3xf32>
return %0 : tensor<1x3xf32>
}
// CHECK-LABEL: func @subtensor_insert_unit_dims
// CHECK: %[[RESHAPE:.+]] = linalg.tensor_reshape %{{.+}} []
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[RESHAPE]]
// CHECK-SAME: tensor<f32> into tensor<1x3xf32>
// CHECK: return %[[RESULT]]