blob: e4ee818b0879912d956901e295d10cfa5c9ea324 [file]
// RUN: mlir-opt %s --linalg-generalize-named-ops \
// RUN: --pre-sparsification-rewrite \
// RUN: --sparse-reinterpret-map \
// RUN: --sparsification="parallelization-strategy=dense-outer-loop" \
// RUN: --sparse-gpu-codegen | FileCheck %s
// Sparse encoding used by both inputs of the test function below: the first
// dimension (d0) is stored dense and the second (d1) is stored compressed.
#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
// CHECK-LABEL: func.func @tensor_add
// CHECK: %[[TENSOR_EMPTY:.*]] = tensor.empty()
// CHECK: %[[OUT_BUF:.*]] = bufferization.to_buffer %[[TENSOR_EMPTY]]
// CHECK: %[[GPU_OUT_BUF:.*]], %[[T0:.*]] = gpu.alloc async [{{.*}}] ()
// CHECK: gpu.memcpy async [%[[T0]]] %[[GPU_OUT_BUF]], %[[OUT_BUF]]
// CHECK: %[[T1:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T1]]] %[[OUT_BUF]], %[[GPU_OUT_BUF]]
// CHECK: gpu.dealloc async [%[[M0]]] %[[GPU_OUT_BUF]]
// Element-wise addition of two 32x32 f32 operands that carry the #CSR
// encoding, producing an unencoded 32x32 f32 result.
//
// All three operands of the linalg.generic (two inputs, one output) use the
// same identity indexing map and both loops are declared parallel. The region
// adds the two input scalars and yields the sum; the incoming output scalar
// is not read.
#tensor_add_map = affine_map<(d0, d1) -> (d0, d1)>
#tensor_add_trait = {
  indexing_maps = [#tensor_add_map, #tensor_add_map, #tensor_add_map],
  iterator_types = ["parallel", "parallel"]
}
func.func @tensor_add(%arg0: tensor<32x32xf32, #CSR>,
                      %arg1: tensor<32x32xf32, #CSR>) -> tensor<32x32xf32> {
  // Destination tensor for the generic op; created with tensor.empty().
  %init = tensor.empty() : tensor<32x32xf32>
  %result = linalg.generic #tensor_add_trait
      ins(%arg0, %arg1 : tensor<32x32xf32, #CSR>, tensor<32x32xf32, #CSR>)
      outs(%init : tensor<32x32xf32>) {
    ^bb0(%lhs: f32, %rhs: f32, %acc: f32):
      %add = arith.addf %lhs, %rhs : f32
      linalg.yield %add : f32
  } -> tensor<32x32xf32>
  return %result : tensor<32x32xf32>
}