// mlir/test/Dialect/SparseTensor/GPU/gpu_codegen_out_buffer.mlir - llvm-project - Git at Google

 // RUN: mlir-opt %s --linalg-generalize-named-ops \
 // RUN:             --pre-sparsification-rewrite \
 // RUN:             --sparse-reinterpret-map \
 // RUN:             --sparsification="parallelization-strategy=dense-outer-loop" \
 // RUN:             --sparse-gpu-codegen | FileCheck %s

 // Sparse tensor encoding shared by both inputs of @tensor_add: the first
 // dimension (d0) is stored as a dense level and the second (d1) as a
 // compressed level.
 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>

 // CHECK-LABEL: func.func @tensor_add
 // CHECK:         %[[TENSOR_EMPTY:.*]] = tensor.empty()
 // CHECK:         %[[OUT_BUF:.*]] = bufferization.to_buffer %[[TENSOR_EMPTY]]
 // CHECK:         %[[GPU_OUT_BUF:.*]], %[[T0:.*]] = gpu.alloc async [{{.*}}] ()
 // CHECK:         gpu.memcpy async [%[[T0]]] %[[GPU_OUT_BUF]], %[[OUT_BUF]]
 // CHECK:         %[[T1:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
 // CHECK:         %[[M0:.*]] = gpu.memcpy async [%[[T1]]] %[[OUT_BUF]], %[[GPU_OUT_BUF]]
 // CHECK:         gpu.dealloc async [%[[M0]]] %[[GPU_OUT_BUF]]

 // Adds two 32x32 f32 tensors carrying the #CSR encoding, element by element,
 // and returns the sum as an unencoded 32x32 f32 tensor. All three operands
 // are accessed through the identity map and both loops are parallel.
 func.func @tensor_add(%arg0: tensor<32x32xf32, #CSR>,
                       %arg1: tensor<32x32xf32, #CSR>) -> tensor<32x32xf32> {
   // Destination tensor handed to linalg.generic as its `outs` operand.
   %init = tensor.empty() : tensor<32x32xf32>
   %result = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,   // %arg0
                      affine_map<(d0, d1) -> (d0, d1)>,   // %arg1
                      affine_map<(d0, d1) -> (d0, d1)>],  // %init
     iterator_types = ["parallel", "parallel"]
   }
   ins(%arg0, %arg1 : tensor<32x32xf32, #CSR>, tensor<32x32xf32, #CSR>)
   outs(%init : tensor<32x32xf32>) {
   ^bb0(%lhs: f32, %rhs: f32, %acc: f32):
     // %acc (the incoming output element) is not read; only lhs + rhs is yielded.
     %added = arith.addf %lhs, %rhs : f32
     linalg.yield %added : f32
   } -> tensor<32x32xf32>
   return %result : tensor<32x32xf32>
 }
	// RUN: mlir-opt %s --linalg-generalize-named-ops \
	// RUN: --pre-sparsification-rewrite \
	// RUN: --sparse-reinterpret-map \
	// RUN: --sparsification="parallelization-strategy=dense-outer-loop" \
	// RUN: --sparse-gpu-codegen | FileCheck %s

	// Sparse tensor encoding used by the inputs of @tensor_add: d0 is a dense
	// level, d1 is a compressed level.
	// NOTE(review): an alias named #CSR is already defined earlier in this file;
	// this looks like a repeated copy and the parser presumably rejects the
	// redefinition -- confirm which copy should remain.
	#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>

	// CHECK-LABEL: func.func @tensor_add
	// CHECK: %[[TENSOR_EMPTY:.*]] = tensor.empty()
	// CHECK: %[[OUT_BUF:.*]] = bufferization.to_buffer %[[TENSOR_EMPTY]]
	// CHECK: %[[GPU_OUT_BUF:.*]], %[[T0:.*]] = gpu.alloc async [{{.*}}] ()
	// CHECK: gpu.memcpy async [%[[T0]]] %[[GPU_OUT_BUF]], %[[OUT_BUF]]
	// CHECK: %[[T1:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
	// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T1]]] %[[OUT_BUF]], %[[GPU_OUT_BUF]]
	// CHECK: gpu.dealloc async [%[[M0]]] %[[GPU_OUT_BUF]]

	// Element-wise sum of two 32x32 f32 tensors with the #CSR encoding, returned
	// as an unencoded 32x32 f32 tensor. Every operand uses the identity indexing
	// map and both iterators are parallel.
	// NOTE(review): a function with the symbol name @tensor_add is already
	// defined earlier in this file; this appears to be a repeated copy --
	// confirm which definition should remain.
	func.func @tensor_add(%arg0: tensor<32x32xf32, #CSR>,
	                      %arg1: tensor<32x32xf32, #CSR>) -> tensor<32x32xf32> {
	  // Destination tensor passed as the `outs` operand below.
	  %init = tensor.empty() : tensor<32x32xf32>
	  %result = linalg.generic {
	    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,   // %arg0
	                     affine_map<(d0, d1) -> (d0, d1)>,   // %arg1
	                     affine_map<(d0, d1) -> (d0, d1)>],  // %init
	    iterator_types = ["parallel", "parallel"]
	  }
	  ins(%arg0, %arg1 : tensor<32x32xf32, #CSR>, tensor<32x32xf32, #CSR>)
	  outs(%init : tensor<32x32xf32>) {
	  ^bb0(%lhs: f32, %rhs: f32, %acc: f32):
	    // The incoming output element %acc is unused; the yielded value is lhs + rhs.
	    %added = arith.addf %lhs, %rhs : f32
	    linalg.yield %added : f32
	  } -> tensor<32x32xf32>
	  return %result : tensor<32x32xf32>
	}