mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir - llvm-project - Git at Google

 // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
 // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \

 // RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
 // RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
 // Activate to dump assembly
 // R_UN:   -dump-object-file -object-filename=/tmp/a.o \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // Use tee to both print to stderr and FileCheck
 // RUN: tee -a /dev/stderr | FileCheck %s


 // Element types of the A, B and C operands; all three are f32 here.
 !elem_type_a = type f32
 !elem_type_b = type f32
 !elem_type_c = type f32
 // Statically shaped memref types for A (MxK), B (KxN) and C (MxN). The ${M},
 // ${K} and ${N} placeholders are replaced by the sed commands in the RUN
 // lines before mlir-opt parses the file.
 !row_major_A = type memref<${M}x${K}x!elem_type_a>
 !row_major_B = type memref<${K}x${N}x!elem_type_b>
 !row_major_C = type memref<${M}x${N}x!elem_type_c>

 // Benchmark kernel: a single linalg.matmul of %lhs and %rhs that updates
 // %acc in place. The codegen-strategy RUN lines anchor on this function by
 // name (anchor-func=matmul).
 func @matmul(%lhs: !row_major_A, %rhs: !row_major_B, %acc: !row_major_C)
 // TODO: activate manually for now.
 // attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
 {
   linalg.matmul ins(%lhs, %rhs : !row_major_A, !row_major_B) outs(%acc : !row_major_C)
   return
 }

 // Prints the achieved throughput: (%iters * 2*M*N*K) operations divided by
 // %total_time, as an f64.
 func @print_perf(%iters: index, %total_time: f64) {
   // Problem sizes; the placeholders are substituted before the file is parsed.
   %dim_m = arith.constant ${M} : index
   %dim_n = arith.constant ${N} : index
   %dim_k = arith.constant ${K} : index
   %two = arith.constant 2 : index

   // One matmul performs M*N*K multiply-add pairs, i.e. 2*M*N*K operations.
   %m_times_n = arith.muli %dim_m, %dim_n : index
   %mac_count = arith.muli %m_times_n, %dim_k : index
   %ops_per_iter = arith.muli %two, %mac_count : index

   // Scale by the iteration count, then go index -> i64 -> f64 so the count
   // can be divided by the f64 elapsed time.
   %total_ops = arith.muli %iters, %ops_per_iter : index
   %total_ops_i64 = arith.index_cast %total_ops : index to i64
   %total_ops_f64 = arith.sitofp %total_ops_i64 : i64 to f64
   %throughput = arith.divf %total_ops_f64, %total_time : f64
   vector.print %throughput : f64

   return
 }

 // Benchmark driver. Fills A and B with ones, runs an untimed warm-up loop
 // followed by a timed loop of ${ITERS} calls to @matmul, prints the measured
 // throughput, and finally prints the error count reported by
 // @verifyMemRefF32 against a reference matmul (expected to be 0).
 func @main() {
   // One constant per element-type alias, so every linalg.fill below stays
   // well-typed even if the three aliases are changed independently.
   %one_a = arith.constant 1.0 : !elem_type_a
   %one_b = arith.constant 1.0 : !elem_type_b
   %zero_c = arith.constant 0.0 : !elem_type_c

   %A = memref.alloc() : !row_major_A
   %B = memref.alloc() : !row_major_B
   %C = memref.alloc() : !row_major_C

   linalg.fill(%one_a, %A) : !elem_type_a, !row_major_A
   linalg.fill(%one_b, %B) : !elem_type_b, !row_major_B
   linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C

   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %iters = arith.constant ${ITERS} : index

   /// Run and dump performance for matmul.
   /// Preheating run (not timed):
   scf.for %arg0 = %c0 to %iters step %c1 {
     linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C
     call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
   }

   /// Timed run: bracket the loop with two clock readings.
   %t_start_matmul = call @rtclock() : () -> f64
   scf.for %arg0 = %c0 to %iters step %c1 {
     // linalg.matmul writes %C in place, so it must be reset to zero on every
     // iteration. This accounts for about a 10-15% perf hit on small sizes.
     // Once linalg on tensors is ready, fusing the fill at the register level
     // will be easy.
     linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C
     call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
   }
   %t_end_matmul = call @rtclock() : () -> f64
   %tmatmul = arith.subf %t_end_matmul, %t_start_matmul : f64
   call @print_perf(%iters, %tmatmul) : (index, f64) -> ()

   // Correctness check: recompute the product into a fresh buffer with a
   // linalg.matmul that lives outside @matmul (the codegen-strategy RUN lines
   // anchor on @matmul only) and compare it with the benchmarked result.
   // CHECK: {{^0$}}
   %C_ref = memref.alloc() : !row_major_C
   linalg.fill(%zero_c, %C_ref) : !elem_type_c, !row_major_C
   linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
     outs(%C_ref : !row_major_C)
   // The verifier takes unranked memrefs, so erase the static shapes first.
   %act = memref.cast %C : !row_major_C to memref<*xf32>
   %exp = memref.cast %C_ref : !row_major_C to memref<*xf32>
   %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
   vector.print %errors : i64
   memref.dealloc %C_ref : !row_major_C

   memref.dealloc %A : !row_major_A
   memref.dealloc %B : !row_major_B
   memref.dealloc %C : !row_major_C

   return
 }

 // External helpers declared without a body; presumably resolved at run time
 // from the runner-utils libraries passed via -shared-libs in the RUN lines.
 // @rtclock returns an f64 timestamp; @main subtracts two readings to time
 // the benchmark loop.
 func private @rtclock() -> f64
 // Takes two unranked f32 memrefs (actual, expected in @main's call) and
 // returns an i64 that @main prints and the CHECK line expects to be 0 --
 // presumably the number of mismatching elements.
 func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }

 // TODO: init with random, run and check output.
 // func private @fill_random_f32(memref<*xf32>)
	// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
	// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
	// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
	// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
	// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \

	// RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
	// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | \
	// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
	// Activate to dump assembly
	// R_UN: -dump-object-file -object-filename=/tmp/a.o \
	// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
	// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
	// Use tee to both print to stderr and FileCheck
	// RUN: tee -a /dev/stderr | FileCheck %s


	// Element types of the A, B and C operands; all three are f32 here.
	!elem_type_a = type f32
	!elem_type_b = type f32
	!elem_type_c = type f32
	// Statically shaped memref types for A (MxK), B (KxN) and C (MxN). The ${M},
	// ${K} and ${N} placeholders are meant to be replaced by the sed commands in
	// the file's RUN lines before mlir-opt parses the file.
	!row_major_A = type memref<${M}x${K}x!elem_type_a>
	!row_major_B = type memref<${K}x${N}x!elem_type_b>
	!row_major_C = type memref<${M}x${N}x!elem_type_c>

	// Benchmark kernel: a single linalg.matmul of %lhs and %rhs that updates
	// %acc in place. The codegen-strategy RUN lines anchor on this function by
	// name (anchor-func=matmul).
	func @matmul(%lhs: !row_major_A, %rhs: !row_major_B, %acc: !row_major_C)
	// TODO: activate manually for now.
	// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
	{
	  linalg.matmul ins(%lhs, %rhs : !row_major_A, !row_major_B) outs(%acc : !row_major_C)
	  return
	}

	// Prints the achieved throughput: (%iters * 2*M*N*K) operations divided by
	// %total_time, as an f64.
	func @print_perf(%iters: index, %total_time: f64) {
	  // Problem sizes; the placeholders are substituted before the file is parsed.
	  %dim_m = arith.constant ${M} : index
	  %dim_n = arith.constant ${N} : index
	  %dim_k = arith.constant ${K} : index
	  %two = arith.constant 2 : index

	  // One matmul performs M*N*K multiply-add pairs, i.e. 2*M*N*K operations.
	  %m_times_n = arith.muli %dim_m, %dim_n : index
	  %mac_count = arith.muli %m_times_n, %dim_k : index
	  %ops_per_iter = arith.muli %two, %mac_count : index

	  // Scale by the iteration count, then go index -> i64 -> f64 so the count
	  // can be divided by the f64 elapsed time.
	  %total_ops = arith.muli %iters, %ops_per_iter : index
	  %total_ops_i64 = arith.index_cast %total_ops : index to i64
	  %total_ops_f64 = arith.sitofp %total_ops_i64 : i64 to f64
	  %throughput = arith.divf %total_ops_f64, %total_time : f64
	  vector.print %throughput : f64

	  return
	}

	// Benchmark driver. Fills A and B with ones, runs an untimed warm-up loop
	// followed by a timed loop of ${ITERS} calls to @matmul, prints the measured
	// throughput, and finally prints the error count reported by
	// @verifyMemRefF32 against a reference matmul (expected to be 0).
	func @main() {
	  // One constant per element-type alias, so every linalg.fill below stays
	  // well-typed even if the three aliases are changed independently.
	  %one_a = arith.constant 1.0 : !elem_type_a
	  %one_b = arith.constant 1.0 : !elem_type_b
	  %zero_c = arith.constant 0.0 : !elem_type_c

	  %A = memref.alloc() : !row_major_A
	  %B = memref.alloc() : !row_major_B
	  %C = memref.alloc() : !row_major_C

	  linalg.fill(%one_a, %A) : !elem_type_a, !row_major_A
	  linalg.fill(%one_b, %B) : !elem_type_b, !row_major_B
	  linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C

	  %c0 = arith.constant 0 : index
	  %c1 = arith.constant 1 : index
	  %iters = arith.constant ${ITERS} : index

	  /// Run and dump performance for matmul.
	  /// Preheating run (not timed):
	  scf.for %arg0 = %c0 to %iters step %c1 {
	    linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C
	    call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
	  }

	  /// Timed run: bracket the loop with two clock readings.
	  %t_start_matmul = call @rtclock() : () -> f64
	  scf.for %arg0 = %c0 to %iters step %c1 {
	    // linalg.matmul writes %C in place, so it must be reset to zero on every
	    // iteration. This accounts for about a 10-15% perf hit on small sizes.
	    // Once linalg on tensors is ready, fusing the fill at the register level
	    // will be easy.
	    linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C
	    call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
	  }
	  %t_end_matmul = call @rtclock() : () -> f64
	  %tmatmul = arith.subf %t_end_matmul, %t_start_matmul : f64
	  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()

	  // Correctness check: recompute the product into a fresh buffer with a
	  // linalg.matmul that lives outside @matmul (the codegen-strategy RUN lines
	  // anchor on @matmul only) and compare it with the benchmarked result.
	  // CHECK: {{^0$}}
	  %C_ref = memref.alloc() : !row_major_C
	  linalg.fill(%zero_c, %C_ref) : !elem_type_c, !row_major_C
	  linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
	    outs(%C_ref : !row_major_C)
	  // The verifier takes unranked memrefs (memref<*xf32>), so erase the static
	  // shapes first; the call signature must match the declaration below.
	  %act = memref.cast %C : !row_major_C to memref<*xf32>
	  %exp = memref.cast %C_ref : !row_major_C to memref<*xf32>
	  %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
	  vector.print %errors : i64
	  memref.dealloc %C_ref : !row_major_C

	  memref.dealloc %A : !row_major_A
	  memref.dealloc %B : !row_major_B
	  memref.dealloc %C : !row_major_C

	  return
	}

	// External helpers declared without a body; presumably resolved at run time
	// from the runner-utils libraries passed via -shared-libs in the RUN lines.
	func private @rtclock() -> f64
	func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }

	// TODO: init with random, run and check output.
	// func private @fill_random_f32(memref<*xf32>)