mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir - llvm-project - Git at Google

 // NOTE: this test requires gpu-sm80
 //
 // DEFINE: %{compile} = mlir-opt %s \
 // DEFINE:   --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
 // DEFINE:   --e main --entry-point-result=void \
 // DEFINE: | FileCheck %s
 //
 // with RT lib (SoA COO):
 //
 // RUN: %{compile} enable-runtime-library=true"  | %{run}
 //
 // without RT lib (AoS COO): note, may fall back to CPU
 //
 // RUN: %{compile} enable-runtime-library=false" | %{run}

 // COO-style encoding (per the alias name) for a 2-D tensor: the first
 // storage level is compressed(nonunique) over d0 and the second level is a
 // singleton over d1. No explicit posWidth/crdWidth is given here.
 #SortedCOO = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
 }>

 // CSR-style encoding (per the alias name): the first storage level is dense
 // over d0, the second is compressed over d1. Position and coordinate widths
 // are both set to 32.
 #CSR = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : dense, d1 : compressed),
   posWidth = 32,
   crdWidth = 32
 }>

 // CSC-style encoding (per the alias name): the dimension order is swapped
 // relative to #CSR, so the first storage level is dense over d1 and the
 // second is compressed over d0. Position and coordinate widths are both 64,
 // unlike the 32 used by #CSR.
 #CSC = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d1 : dense, d0 : compressed),
   posWidth = 64,
   crdWidth = 64
 }>

 module {
   // External (body-less) declarations of the two hooks that @main calls as
   // its first and last operations. Presumably they are resolved from the
   // library passed via --shared-libs=%mlir_cuda_runtime -- TODO confirm.
   llvm.func @mgpuCreateSparseEnv()
   llvm.func @mgpuDestroySparseEnv()

   // Matrix multiply with a #SortedCOO-encoded left operand %A and a dense
   // right operand %B. %C is the linalg.matmul output operand; the expected
   // output in @main shows the returned tensor equals %C + %A x %B.
   func.func @matmulCOO(
       %A: tensor<8x8xf32, #SortedCOO>,
       %B: tensor<8x8xf32>,
       %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
     %product = linalg.matmul
         ins(%A, %B : tensor<8x8xf32, #SortedCOO>, tensor<8x8xf32>)
         outs(%C : tensor<8x8xf32>) -> tensor<8x8xf32>
     return %product : tensor<8x8xf32>
   }

   // Matrix multiply with a #CSR-encoded left operand %A and a dense right
   // operand %B. %C is the linalg.matmul output operand; the expected output
   // in @main shows the returned tensor equals %C + %A x %B.
   func.func @matmulCSR(
       %A: tensor<8x8xf32, #CSR>,
       %B: tensor<8x8xf32>,
       %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
     %product = linalg.matmul
         ins(%A, %B : tensor<8x8xf32, #CSR>, tensor<8x8xf32>)
         outs(%C : tensor<8x8xf32>) -> tensor<8x8xf32>
     return %product : tensor<8x8xf32>
   }

   // Matrix multiply with a #CSC-encoded left operand %A and a dense right
   // operand %B. %C is the linalg.matmul output operand; the expected output
   // in @main shows the returned tensor equals %C + %A x %B.
   func.func @matmulCSC(
       %A: tensor<8x8xf32, #CSC>,
       %B: tensor<8x8xf32>,
       %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
     %product = linalg.matmul
         ins(%A, %B : tensor<8x8xf32, #CSC>, tensor<8x8xf32>)
         outs(%C : tensor<8x8xf32>) -> tensor<8x8xf32>
     return %product : tensor<8x8xf32>
   }

   // Prints a dense 8x8 f32 tensor one row at a time: each iteration reads
   // row %row as a vector<8xf32> (starting at column 0, padding value 0.0)
   // and emits it with vector.print.
   func.func @dump(%mat: tensor<8x8xf32>) {
     %lo  = arith.constant 0 : index
     %hi  = arith.constant 8 : index
     %one = arith.constant 1 : index
     %pad = arith.constant 0.0 : f32
     scf.for %row = %lo to %hi step %one {
       %vec = vector.transfer_read %mat[%row, %lo], %pad
           : tensor<8x8xf32>, vector<8xf32>
       vector.print %vec : vector<8xf32>
     }
     return
   }

   //
   // Main driver.
   //
   func.func @main() {
     // Paired with the @mgpuDestroySparseEnv call at the very end of this
     // function. NOTE(review): presumably sets up the GPU sparse library
     // state needed by the generated library calls -- confirm.
     llvm.call @mgpuCreateSparseEnv(): () -> ()
     %f0 = arith.constant 0.0 : f32
     %f1 = arith.constant 1.0 : f32

     // Stress test with a dense matrix DA, where DA[i][j] = i + j converted
     // to f32 (index -> i64 -> unsigned-int-to-float). Only DA[0][0] is zero.
     %DA = tensor.generate {
     ^bb0(%i: index, %j: index):
       %k = arith.addi %i, %j : index
       %l = arith.index_cast %k : index to i64
       %f = arith.uitofp %l : i64 to f32
       tensor.yield %f : f32
     } : tensor<8x8xf32>

     // Convert to a "sparse" matrix A, once per encoding under test.
     %Acoo = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO>
     %Acsr = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #CSR>
     %Acsc = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #CSC>

     // Initial C matrices: %C0 is all zeros, %C1 is all ones.
     %C0 = tensor.generate {
     ^bb0(%i: index, %j: index):
       tensor.yield %f0 : f32
     } : tensor<8x8xf32>
     %C1 = tensor.generate {
     ^bb0(%i: index, %j: index):
       tensor.yield %f1 : f32
     } : tensor<8x8xf32>

     // Call the kernels: each encoding (COO, CSR, CSC) is run with B = DA,
     // first against the zero matrix %C0 (%0..%2), then against the all-ones
     // matrix %C1 (%3..%5).
     %0 = call @matmulCOO(%Acoo, %DA, %C0) : (tensor<8x8xf32, #SortedCOO>,
                                              tensor<8x8xf32>,
 					     tensor<8x8xf32>) -> tensor<8x8xf32>
     %1 = call @matmulCSR(%Acsr, %DA, %C0) : (tensor<8x8xf32, #CSR>,
                                              tensor<8x8xf32>,
 					     tensor<8x8xf32>) -> tensor<8x8xf32>
     %2 = call @matmulCSC(%Acsc, %DA, %C0) : (tensor<8x8xf32, #CSC>,
                                              tensor<8x8xf32>,
 					     tensor<8x8xf32>) -> tensor<8x8xf32>
     %3 = call @matmulCOO(%Acoo, %DA, %C1) : (tensor<8x8xf32, #SortedCOO>,
                                              tensor<8x8xf32>,
 					     tensor<8x8xf32>) -> tensor<8x8xf32>
     %4 = call @matmulCSR(%Acsr, %DA, %C1) : (tensor<8x8xf32, #CSR>,
                                              tensor<8x8xf32>,
 					     tensor<8x8xf32>) -> tensor<8x8xf32>
     %5 = call @matmulCSC(%Acsc, %DA, %C1) : (tensor<8x8xf32, #CSC>,
                                              tensor<8x8xf32>,
 					     tensor<8x8xf32>) -> tensor<8x8xf32>

     //
     // Sanity check on results.
     //
     // Six 8-row groups follow, in the same order as the @dump calls below
     // (%0 through %5). The first three groups are DA x DA (e.g. entry [0][0]
     // is 0^2 + 1^2 + ... + 7^2 = 140); the last three are the same values
     // plus one, because those calls start from the all-ones %C1.
     //
     // CHECK:      ( 140, 168, 196, 224, 252, 280, 308, 336 )
     // CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
     // CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
     // CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
     // CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
     // CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
     // CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
     // CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
     //
     // CHECK:      ( 140, 168, 196, 224, 252, 280, 308, 336 )
     // CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
     // CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
     // CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
     // CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
     // CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
     // CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
     // CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
     //
     // CHECK:      ( 140, 168, 196, 224, 252, 280, 308, 336 )
     // CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
     // CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
     // CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
     // CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
     // CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
     // CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
     // CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
     //
     // CHECK:      ( 141, 169, 197, 225, 253, 281, 309, 337 )
     // CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
     // CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
     // CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
     // CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
     // CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
     // CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
     // CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
     //
     // CHECK:      ( 141, 169, 197, 225, 253, 281, 309, 337 )
     // CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
     // CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
     // CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
     // CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
     // CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
     // CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
     // CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
     //
     // CHECK:      ( 141, 169, 197, 225, 253, 281, 309, 337 )
     // CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
     // CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
     // CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
     // CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
     // CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
     // CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
     // CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
     //
     // The dump order must stay in sync with the expectation groups above.
     call @dump(%0) : (tensor<8x8xf32>) -> ()
     call @dump(%1) : (tensor<8x8xf32>) -> ()
     call @dump(%2) : (tensor<8x8xf32>) -> ()
     call @dump(%3) : (tensor<8x8xf32>) -> ()
     call @dump(%4) : (tensor<8x8xf32>) -> ()
     call @dump(%5) : (tensor<8x8xf32>) -> ()

     // Release the resources. Only the three converted sparse tensors are
     // deallocated explicitly here.
     bufferization.dealloc_tensor %Acoo : tensor<8x8xf32, #SortedCOO>
     bufferization.dealloc_tensor %Acsr : tensor<8x8xf32, #CSR>
     bufferization.dealloc_tensor %Acsc : tensor<8x8xf32, #CSC>

     // Counterpart of the @mgpuCreateSparseEnv call at the top of @main.
     llvm.call @mgpuDestroySparseEnv(): () -> ()

     return
   }
 }
	// NOTE: this test requires gpu-sm80
	//
	// DEFINE: %{compile} = mlir-opt %s \
	// DEFINE: --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
	// DEFINE: %{run} = mlir-cpu-runner \
	// DEFINE: --shared-libs=%mlir_cuda_runtime \
	// DEFINE: --shared-libs=%mlir_c_runner_utils \
	// DEFINE: --e main --entry-point-result=void \
	// DEFINE: | FileCheck %s
	//
	// with RT lib (SoA COO):
	//
	// RUN: %{compile} enable-runtime-library=true" | %{run}
	//
	// without RT lib (AoS COO): note, may fall back to CPU
	//
	// RUN: %{compile} enable-runtime-library=false" | %{run}

	// COO-style encoding (per the alias name) for a 2-D tensor: the first
	// storage level is compressed(nonunique) over d0 and the second level is a
	// singleton over d1. No explicit posWidth/crdWidth is given here.
	#SortedCOO = #sparse_tensor.encoding<{
	map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
	}>

	// CSR-style encoding (per the alias name): the first storage level is dense
	// over d0, the second is compressed over d1. Position and coordinate widths
	// are both set to 32.
	#CSR = #sparse_tensor.encoding<{
	map = (d0, d1) -> (d0 : dense, d1 : compressed),
	posWidth = 32,
	crdWidth = 32
	}>

	// CSC-style encoding (per the alias name): the dimension order is swapped
	// relative to #CSR, so the first storage level is dense over d1 and the
	// second is compressed over d0. Position and coordinate widths are both 64,
	// unlike the 32 used by #CSR.
	#CSC = #sparse_tensor.encoding<{
	map = (d0, d1) -> (d1 : dense, d0 : compressed),
	posWidth = 64,
	crdWidth = 64
	}>

	module {
	// External (body-less) declarations of the two hooks that @main calls as
	// its first and last operations. Presumably they are resolved from the
	// library passed via --shared-libs=%mlir_cuda_runtime -- TODO confirm.
	llvm.func @mgpuCreateSparseEnv()
	llvm.func @mgpuDestroySparseEnv()

	// Matrix multiply with a #SortedCOO-encoded left operand %A and a dense
	// right operand %B. %C is the linalg.matmul output operand; the expected
	// output in @main shows the returned tensor equals %C + %A x %B.
	func.func @matmulCOO(
	    %A: tensor<8x8xf32, #SortedCOO>,
	    %B: tensor<8x8xf32>,
	    %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
	  %product = linalg.matmul
	      ins(%A, %B : tensor<8x8xf32, #SortedCOO>, tensor<8x8xf32>)
	      outs(%C : tensor<8x8xf32>) -> tensor<8x8xf32>
	  return %product : tensor<8x8xf32>
	}

	// Matrix multiply with a #CSR-encoded left operand %A and a dense right
	// operand %B. %C is the linalg.matmul output operand; the expected output
	// in @main shows the returned tensor equals %C + %A x %B.
	func.func @matmulCSR(
	    %A: tensor<8x8xf32, #CSR>,
	    %B: tensor<8x8xf32>,
	    %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
	  %product = linalg.matmul
	      ins(%A, %B : tensor<8x8xf32, #CSR>, tensor<8x8xf32>)
	      outs(%C : tensor<8x8xf32>) -> tensor<8x8xf32>
	  return %product : tensor<8x8xf32>
	}

	// Matrix multiply with a #CSC-encoded left operand %A and a dense right
	// operand %B. %C is the linalg.matmul output operand; the expected output
	// in @main shows the returned tensor equals %C + %A x %B.
	func.func @matmulCSC(
	    %A: tensor<8x8xf32, #CSC>,
	    %B: tensor<8x8xf32>,
	    %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
	  %product = linalg.matmul
	      ins(%A, %B : tensor<8x8xf32, #CSC>, tensor<8x8xf32>)
	      outs(%C : tensor<8x8xf32>) -> tensor<8x8xf32>
	  return %product : tensor<8x8xf32>
	}

	// Prints a dense 8x8 f32 tensor one row at a time: each iteration reads
	// row %row as a vector<8xf32> (starting at column 0, padding value 0.0)
	// and emits it with vector.print.
	func.func @dump(%mat: tensor<8x8xf32>) {
	  %lo  = arith.constant 0 : index
	  %hi  = arith.constant 8 : index
	  %one = arith.constant 1 : index
	  %pad = arith.constant 0.0 : f32
	  scf.for %row = %lo to %hi step %one {
	    %vec = vector.transfer_read %mat[%row, %lo], %pad
	        : tensor<8x8xf32>, vector<8xf32>
	    vector.print %vec : vector<8xf32>
	  }
	  return
	}

	//
	// Main driver.
	//
	func.func @main() {
	// Paired with the @mgpuDestroySparseEnv call at the very end of this
	// function. NOTE(review): presumably sets up the GPU sparse library
	// state needed by the generated library calls -- confirm.
	llvm.call @mgpuCreateSparseEnv(): () -> ()
	%f0 = arith.constant 0.0 : f32
	%f1 = arith.constant 1.0 : f32

	// Stress test with a dense matrix DA, where DA[i][j] = i + j converted
	// to f32 (index -> i64 -> unsigned-int-to-float). Only DA[0][0] is zero.
	%DA = tensor.generate {
	^bb0(%i: index, %j: index):
	%k = arith.addi %i, %j : index
	%l = arith.index_cast %k : index to i64
	%f = arith.uitofp %l : i64 to f32
	tensor.yield %f : f32
	} : tensor<8x8xf32>

	// Convert to a "sparse" matrix A, once per encoding under test.
	%Acoo = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO>
	%Acsr = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #CSR>
	%Acsc = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #CSC>

	// Initial C matrices: %C0 is all zeros, %C1 is all ones.
	%C0 = tensor.generate {
	^bb0(%i: index, %j: index):
	tensor.yield %f0 : f32
	} : tensor<8x8xf32>
	%C1 = tensor.generate {
	^bb0(%i: index, %j: index):
	tensor.yield %f1 : f32
	} : tensor<8x8xf32>

	// Call the kernels: each encoding (COO, CSR, CSC) is run with B = DA,
	// first against the zero matrix %C0 (%0..%2), then against the all-ones
	// matrix %C1 (%3..%5).
	%0 = call @matmulCOO(%Acoo, %DA, %C0) : (tensor<8x8xf32, #SortedCOO>,
	tensor<8x8xf32>,
	tensor<8x8xf32>) -> tensor<8x8xf32>
	%1 = call @matmulCSR(%Acsr, %DA, %C0) : (tensor<8x8xf32, #CSR>,
	tensor<8x8xf32>,
	tensor<8x8xf32>) -> tensor<8x8xf32>
	%2 = call @matmulCSC(%Acsc, %DA, %C0) : (tensor<8x8xf32, #CSC>,
	tensor<8x8xf32>,
	tensor<8x8xf32>) -> tensor<8x8xf32>
	%3 = call @matmulCOO(%Acoo, %DA, %C1) : (tensor<8x8xf32, #SortedCOO>,
	tensor<8x8xf32>,
	tensor<8x8xf32>) -> tensor<8x8xf32>
	%4 = call @matmulCSR(%Acsr, %DA, %C1) : (tensor<8x8xf32, #CSR>,
	tensor<8x8xf32>,
	tensor<8x8xf32>) -> tensor<8x8xf32>
	%5 = call @matmulCSC(%Acsc, %DA, %C1) : (tensor<8x8xf32, #CSC>,
	tensor<8x8xf32>,
	tensor<8x8xf32>) -> tensor<8x8xf32>

	//
	// Sanity check on results.
	//
	// Six 8-row groups follow, in the same order as the @dump calls below
	// (%0 through %5). The first three groups are DA x DA (e.g. entry [0][0]
	// is 0^2 + 1^2 + ... + 7^2 = 140); the last three are the same values
	// plus one, because those calls start from the all-ones %C1.
	//
	// CHECK: ( 140, 168, 196, 224, 252, 280, 308, 336 )
	// CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
	// CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
	// CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
	// CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
	// CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
	// CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
	// CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
	//
	// CHECK: ( 140, 168, 196, 224, 252, 280, 308, 336 )
	// CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
	// CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
	// CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
	// CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
	// CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
	// CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
	// CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
	//
	// CHECK: ( 140, 168, 196, 224, 252, 280, 308, 336 )
	// CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
	// CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
	// CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
	// CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
	// CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
	// CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
	// CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
	//
	// CHECK: ( 141, 169, 197, 225, 253, 281, 309, 337 )
	// CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
	// CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
	// CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
	// CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
	// CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
	// CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
	// CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
	//
	// CHECK: ( 141, 169, 197, 225, 253, 281, 309, 337 )
	// CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
	// CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
	// CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
	// CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
	// CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
	// CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
	// CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
	//
	// CHECK: ( 141, 169, 197, 225, 253, 281, 309, 337 )
	// CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
	// CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
	// CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
	// CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
	// CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
	// CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
	// CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
	//
	// The dump order must stay in sync with the expectation groups above.
	call @dump(%0) : (tensor<8x8xf32>) -> ()
	call @dump(%1) : (tensor<8x8xf32>) -> ()
	call @dump(%2) : (tensor<8x8xf32>) -> ()
	call @dump(%3) : (tensor<8x8xf32>) -> ()
	call @dump(%4) : (tensor<8x8xf32>) -> ()
	call @dump(%5) : (tensor<8x8xf32>) -> ()

	// Release the resources. Only the three converted sparse tensors are
	// deallocated explicitly here.
	bufferization.dealloc_tensor %Acoo : tensor<8x8xf32, #SortedCOO>
	bufferization.dealloc_tensor %Acsr : tensor<8x8xf32, #CSR>
	bufferization.dealloc_tensor %Acsc : tensor<8x8xf32, #CSC>

	// Counterpart of the @mgpuCreateSparseEnv call at the top of @main.
	llvm.call @mgpuDestroySparseEnv(): () -> ()

	return
	}
	}