// blob: ec8d43b23de7ce8b189e9f6c20f68e73f1b0801d [file] [log] [blame]
// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s
// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s
// Test case: the first matmul input is produced by a fill of %arg0.
// The checks expect the fill to be recomputed on a slice of %arg0 whose
// offsets are the induction variables of the second and third tile loops,
// and the tiled matmul to consume that tiled fill.
// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 24)>
// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>

// MATMUL: fuse_input
// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
builtin.func @fuse_input(%arg0: tensor<24x12xf32>,
                         %arg1: tensor<12x25xf32>,
                         %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
  %c24 = arith.constant 24 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  // Producer to be fused: fills the tensor used as the first matmul input.
  %0 = linalg.fill(%cst, %arg0) : f32, tensor<24x12xf32> -> tensor<24x12xf32>

  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])

  // Tile both input operand dimensions.
  // MATMUL: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]])
  // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
  // MATMUL-SAME: %[[IV1]], %[[IV2]]
  // MATMUL-SAME: %[[UB1]], %[[UB2]]
  // MATMUL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
  // MATMUL: %{{.*}} = linalg.matmul ins(%[[T1]]
  %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
  return %1 : tensor<24x25xf32>
}
// -----
// Test case: the matmul output operand is produced by a fill of %arg2.
// The checks expect the fill to be applied to the output tile extracted from
// the iteration argument of the second tile loop, and its result to seed the
// iteration argument of the third tile loop.
// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (4, -d0 + 25)>

// MATMUL: fuse_output
// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
builtin.func @fuse_output(%arg0: tensor<24x12xf32>,
                          %arg1: tensor<12x25xf32>,
                          %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // MATMUL-DAG: %[[C1:.*]] = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
  %c24 = arith.constant 24 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  // Producer to be fused: fills the tensor used as the matmul output operand.
  %0 = linalg.fill(%cst, %arg2) : f32, tensor<24x25xf32> -> tensor<24x25xf32>

  // Update the iteration argument of the outermost tile loop.
  // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
  // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
  // MATMUL: %[[TS0:.*]] = affine.min #[[MAP1]](%[[IV0]])

  // Tile both output operand dimensions.
  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
  // MATMUL-SAME: %[[IV1]], %[[IV0]]
  // MATMUL-SAME: %[[TS1]], %[[TS0]]
  // MATMUL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
  // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]

  // Check there is an extract/insert slice pair for the output operand.
  // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG5]], %[[C0]]
  // MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG5]], %[[C1]]
  // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG5]]
  // MATMUL-SAME: 0, 0
  // MATMUL-SAME: %[[D0]], %[[D1]]
  // MATMUL: %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
  // MATMUL: %{{.*}} = tensor.insert_slice %[[T3]] into %[[ARG5]]
  // MATMUL-SAME: 0, 0
  // MATMUL-SAME: %[[D0]], %[[D1]]
  %1 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%0 : tensor<24x25xf32>) -> tensor<24x25xf32>
  return %1 : tensor<24x25xf32>
}
// -----
// Test case: the second matmul input is produced by a generic op with a
// reduction iterator that reduces %arg3 (12x7x25) into %arg1 (12x25).
// The checks expect the producer slices to be tiled along the two parallel
// dimensions only; the reduction dimension keeps offset 0 and full size 7.
// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (4, -d0 + 25)>
// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 25)>
// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>

// MATMUL: fuse_reduction
// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32>
builtin.func @fuse_reduction(%arg0: tensor<24x12xf32>,
                             %arg1: tensor<12x25xf32>,
                             %arg2: tensor<24x25xf32>,
                             %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
  %c24 = arith.constant 24 : index
  %c4 = arith.constant 4 : index
  // Producer to be fused: accumulates %arg3 along its middle dimension.
  %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg3 : tensor<12x7x25xf32>) outs(%arg1 : tensor<12x25xf32>) {
  ^bb0(%arg4: f32, %arg5: f32):  // no predecessors
    %2 = arith.addf %arg4, %arg5 : f32
    linalg.yield %2 : f32
  } -> tensor<12x25xf32>

  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
  // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])
  // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
  // MATMUL: %[[UB0:.*]] = affine.min #[[MAP2]](%[[TS0]], %[[IV0]])

  // Tile only the parallel dimensions but not the reduction dimension.
  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
  // MATMUL-SAME: %[[IV2]], 0, %[[IV0]]
  // MATMUL-SAME: %[[UB2]], 7, %[[UB0]]
  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
  // MATMUL-SAME: %[[IV2]], %[[IV0]]
  // MATMUL-SAME: %[[UB2]], %[[UB0]]
  // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
  // MATMUL: %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]]
  %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
  return %1 : tensor<24x25xf32>
}
// -----
// Test case: the first matmul input is produced by a generic op that reads
// %arg3 through a transposed indexing map (d1, d0).
// The checks expect the slice offsets of the transposed input to be swapped
// relative to the slice offsets of the producer's output operand.
#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>

// MATMUL: fuse_transposed
// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32>
builtin.func @fuse_transposed(%arg0: tensor<24x12xf32>,
                              %arg1: tensor<12x25xf32>,
                              %arg2: tensor<24x25xf32>,
                              %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
  %c24 = arith.constant 24 : index
  %c4 = arith.constant 4 : index
  // Producer to be fused: adds the transposed %arg3 onto %arg0.
  %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<12x24xf32>) outs(%arg0 : tensor<24x12xf32>) {
  ^bb0(%arg4: f32, %arg5: f32):  // no predecessors
    %2 = arith.addf %arg4, %arg5 : f32
    linalg.yield %2 : f32
  } -> tensor<24x12xf32>

  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =

  // Swap the input operand slice offsets due to the transposed indexing map.
  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
  // MATMUL-SAME: %[[IV2]], %[[IV1]]
  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
  // MATMUL-SAME: %[[IV1]], %[[IV2]]
  // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
  // MATMUL: %{{.*}} = linalg.matmul ins(%[[T2]]
  %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
  return %1 : tensor<24x25xf32>
}
// -----
// Test case: both the first matmul input and the matmul output operand are
// produced by fills.
// The checks expect the output fill ahead of the third tile loop (its result
// seeds that loop's iteration argument) and the input fill on a slice whose
// offsets use the induction variables of the second and third tile loops.
// MATMUL: fuse_input_and_output
// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
builtin.func @fuse_input_and_output(%arg0: tensor<24x12xf32>,
                                    %arg1: tensor<12x25xf32>,
                                    %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
  %c24 = arith.constant 24 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  // Producers to be fused: %0 feeds the matmul input, %1 the matmul output.
  %0 = linalg.fill(%cst, %arg0) : f32, tensor<24x12xf32> -> tensor<24x12xf32>
  %1 = linalg.fill(%cst, %arg2) : f32, tensor<24x25xf32> -> tensor<24x25xf32>

  // Fuse both producers to the appropriate tile loops.
  // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
  // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
  // MATMUL-SAME: %[[IV1]], %[[IV0]]
  // MATMUL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
  // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]
  // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG0]]
  // MATMUL-SAME: %[[IV1]], %[[IV2]]
  // MATMUL: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]])
  // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG5]]
  // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]]
  %2 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%1 : tensor<24x25xf32>) -> tensor<24x25xf32>
  return %2 : tensor<24x25xf32>
}
// -----
// Test case: the second matmul input is produced by a generic op whose body
// uses linalg.index and whose output is accessed through a transposed map.
// The checks expect each linalg.index result in the fused producer to be
// shifted by a tile-loop induction variable through an affine.apply of
// (d0 + d1).
// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
#map0 = affine_map<(d0, d1) -> (d1, d0)>

// MATMUL: fuse_indexed
// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32>
builtin.func @fuse_indexed(%arg0: tensor<24x12xi32>,
                           %arg1: tensor<12x25xi32>,
                           %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
  %c24 = arith.constant 24 : index
  %c4 = arith.constant 4 : index
  // Producer to be fused: writes the sum of the two iteration indices,
  // cast to i32, into every element of %arg1.
  %0 = linalg.generic {indexing_maps = [#map0], iterator_types = ["parallel", "parallel"]} outs(%arg1 : tensor<12x25xi32>) {
  ^bb0(%arg3: i32):  // no predecessors
    %6 = linalg.index 0 : index
    %7 = linalg.index 1 : index
    %8 = arith.addi %6, %7 : index
    %9 = arith.index_cast %8 : index to i32
    linalg.yield %9 : i32
  } -> tensor<12x25xi32>

  // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
  // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
  // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =

  // Shift the indexes by the slice offsets and swap the offsets due to the transposed indexing map.
  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
  // MATMUL-SAME: %[[IV2]], %[[IV0]]
  // MATMUL: linalg.generic {{.*}} outs(%[[T1]]
  // MATMUL: %[[IDX0:.*]] = linalg.index 0
  // MATMUL: %[[IDX0_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX0]], %[[IV0]])
  // MATMUL: %[[IDX1:.*]] = linalg.index 1
  // MATMUL: %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]])
  // MATMUL: %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]]
  %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xi32>, tensor<12x25xi32>) outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32>
  return %1 : tensor<24x25xi32>
}
// -----
// Test case (anchored on linalg.generic): a row reduction whose input and
// output operands are both produced by fills.
// The checks expect the output fill to stay ahead of the tile loops and only
// the input fill to be recomputed on a slice inside them.
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>

// GENERIC: fuse_outermost_reduction
// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
// GENERIC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32>
func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
                               %arg1: tensor<10xf32>) -> tensor<10xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = linalg.fill(%cst, %arg0) : f32, tensor<10x17xf32> -> tensor<10x17xf32>

  // Cannot fuse the output fill since the reduction loop is the outermost loop.
  // GENERIC: %[[T0:.*]] = linalg.fill(%{{.*}}, %[[ARG1]])
  %1 = linalg.fill(%cst, %arg1) : f32, tensor<10xf32> -> tensor<10xf32>

  // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[T0]]
  // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]

  // Check the input fill has been fused.
  // GENERIC: %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
  // GENERIC-SAME: %[[IV1]], %[[IV0]]
  // GENERIC: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]])
  // GENERIC: %[[T3:.*]] = tensor.extract_slice %[[ARG3]]
  // GENERIC: linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]]
  %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<10x17xf32>) outs(%1 : tensor<10xf32>) {
  ^bb0(%arg2: f32, %arg3: f32):  // no predecessors
    %3 = arith.addf %arg2, %arg3 : f32
    linalg.yield %3 : f32
  } -> tensor<10xf32>
  return %2 : tensor<10xf32>
}
// -----
// Test case (anchored on linalg.generic): the consumer reads its fill-produced
// input through the non-rectangular indexing map (d0, d0 + d1).
// The checks expect the fused fill to be computed on a slice of %arg0 whose
// second offset is the sum of both tile-loop induction variables.
// GENERIC-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>
// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0, -d1 - d2 + 17)>
#map0 = affine_map<(d0, d1) -> (d0, d0 + d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>

// GENERIC: fuse_non_rectangular
// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
                           %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {
  // GENERIC-DAG: %[[C0:.*]] = arith.constant 0 : index
  // GENERIC-DAG: %[[C4:.*]] = arith.constant 4 : index
  // GENERIC-DAG: %[[C5:.*]] = arith.constant 5 : index
  // GENERIC-DAG: %[[C8:.*]] = arith.constant 8 : index
  // GENERIC-DAG: %[[C10:.*]] = arith.constant 10 : index
  %cst = arith.constant 0.000000e+00 : f32
  %0 = linalg.fill(%cst, %arg0) : f32, tensor<10x17xf32> -> tensor<10x17xf32>

  // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = %[[C0]] to %[[C8]] step %[[C4]]
  // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = %[[C0]] to %[[C10]] step %[[C5]]

  // Compute producer on a hyper rectangular bounding box. Along the second dimension,
  // the offset is set to the sum of the induction variables, and the upper bound
  // to either 8 (tile size) or 17 (sum of max indices (9+7) then + 1) minus the
  // induction variables.
  // GENERIC-DAG: %[[SUM:.*]] = affine.apply #[[MAP0]](%[[IV1]], %[[IV0]]
  // GENERIC-DAG: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]], %[[IV0]]
  // GENERIC-DAG: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]], %[[IV0]]
  // GENERIC: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
  // GENERIC-SAME: %[[IV1]], %[[SUM]]
  // GENERIC-SAME: , %[[UB1]]
  // GENERIC: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]])
  %1 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x17xf32>) outs(%arg1 : tensor<10x8xf32>) {
  ^bb0(%arg2: f32, %arg3: f32):  // no predecessors
    %2 = arith.addf %arg2, %arg3 : f32
    linalg.yield %2 : f32
  } -> tensor<10x8xf32>
  return %1 : tensor<10x8xf32>
}