// RUN: mlir-opt %s --transform-interpreter --canonicalize --split-input-file --verify-diagnostics | FileCheck %s
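// Tests for transform.nvgpu.pipeline_shared_memory_copies, which
// software-pipelines scf.for loops staging data from global to workgroup
// (shared) memory.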
| |
func.func @simple_depth_2_unpeeled(%global: memref<?xf32>, %result: memref<?xf32>) {
| %c0 = arith.constant 0 : index |
| %c100 = arith.constant 100 : index |
| %c4 = arith.constant 4 : index |
| %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>> |
| %c0f = arith.constant 0.0 : f32 |
  // Predication is not currently implemented for vector.transfer_read/write,
  // so pipelining without peeling the epilogue is expected to fail.
| // expected-note @below {{couldn't predicate}} |
  scf.for %i = %c0 to %c100 step %c4 iter_args(%accum = %c0f) -> (f32) {
| %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32> |
| vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>> |
| %0 = arith.addf %accum, %accum : f32 |
| scf.yield %0 : f32 |
| } |
| return |
| } |
| |
| !t = !transform.any_op |
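
// The script below requests a two-stage pipeline (depth = 2) without peeling
// the epilogue, so every operation rotated across iterations must be
// predicated.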
| |
| module attributes {transform.with_named_sequence} { |
| transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) { |
| %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t |
| // expected-error @below {{irreversible pipelining failure}} |
| // expected-note @below {{try setting "peel_epilogue"}} |
| transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t |
| transform.yield |
| } |
| } |
| |
| // ----- |
| |
// The loop pipeliner is tested separately; this test only verifies the
// overall shape of the resulting IR.
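// With depth = 2 and peel_epilogue, the expected shape is: a prologue with
// two hoisted transfer_reads, a kernel that writes the oldest in-flight
// vector, calls @body, and reads one iteration ahead, and an epilogue that
// writes the last two vectors, each followed by a call.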
| |
| func.func private @body(index, memref<?xf32, #gpu.address_space<workgroup>>) |
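// @body stands in for an arbitrary side-effecting consumer of the shared
// buffer inside the loop.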
| |
| // CHECK-LABEL: @simple_depth_2_peeled |
| // CHECK-SAME: %[[ARG:.+]]: memref |
| func.func @simple_depth_2_peeled(%global: memref<?xf32>) { |
| %c0 = arith.constant 0 : index |
| %c100 = arith.constant 100 : index |
| %c200 = arith.constant 200 : index |
| %c4 = arith.constant 4 : index |
| // CHECK: memref.alloc |
| %shared = memref.alloc(%c200) : memref<?xf32, #gpu.address_space<workgroup>> |
| %c0f = arith.constant 0.0 : f32 |
| // CHECK: %[[LOADED1:.+]] = vector.transfer_read %[[ARG]] |
| // CHECK: %[[LOADED2:.+]] = vector.transfer_read %[[ARG]] |
| // CHECK: %[[LOOP:.+]]:2 = scf.for {{.*}} iter_args(%[[IA1:.+]] = %[[LOADED1]], %[[IA2:.+]] = %[[LOADED2]]) |
| // CHECK: vector.transfer_write %[[IA1]] |
| // CHECK: func.call @body |
| // CHECK: %[[LOCAL_LOADED:.+]] = vector.transfer_read %[[ARG]] |
| // CHECK: scf.yield %[[IA2]], %[[LOCAL_LOADED]] |
| scf.for %i = %c0 to %c100 step %c4 { |
| %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32> |
| vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>> |
| func.call @body(%i, %shared) : (index, memref<?xf32, #gpu.address_space<workgroup>>) -> () |
| } |
| // CHECK: vector.transfer_write %[[LOOP]]#0 |
| // CHECK: call @body |
| // CHECK: vector.transfer_write %[[LOOP]]#1 |
| // CHECK: call @body |
| return |
| } |
| |
| !t = !transform.any_op |
| |
| module attributes {transform.with_named_sequence} { |
| transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) { |
| %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t |
| transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t |
| transform.yield |
| } |
| } |
| |
| // ----- |
| |
| // CHECK-LABEL: @async_depth_2_predicated |
| // CHECK-SAME: %[[GLOBAL:.+]]: memref |
| func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) { |
| %c0 = arith.constant 0 : index |
| %c98 = arith.constant 98 : index |
| %c100 = arith.constant 100 : index |
| // CHECK-DAG: %[[C4:.+]] = arith.constant 4 |
| // CHECK-DAG: %[[C90:.+]] = arith.constant 90 |
| // CHECK-DAG: %[[C96:.+]] = arith.constant 96 |
| // CHECK-DAG: %[[C8:.+]] = arith.constant 8 |
| // CHECK-DAG: %[[C2:.+]] = arith.constant 2 |
| // CHECK-DAG: %[[C0:.+]] = arith.constant 0 |
| %c4 = arith.constant 4 : index |
| // CHECK: %[[SHARED:.+]] = memref.alloc{{.*}} #gpu.address_space<workgroup> |
| %shared = memref.alloc(%alloc_size) : memref<?xf32, #gpu.address_space<workgroup>> |
| %c0f = arith.constant 0.0 : f32 |
| // CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy |
| // CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy |
| // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args |
| // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]] |
| // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]] |
| scf.for %i = %c0 to %c98 step %c4 { |
| // Condition for the predication "select" below. |
| // CHECK: %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]] |
| // CHECK: nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1 |
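    // With numGroups = 1, this wait blocks until at most one async copy
    // group is still outstanding.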
| // Original "select" with updated induction variable. |
| // CHECK: %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]] |
| // CHECK: %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]] |
| // CHECK: %[[SELECTED0:.+]] = arith.select %[[CMP1]], %[[C4]], %[[C2]] |
| %c96 = arith.constant 96 : index |
| %cond = arith.cmpi slt, %i, %c96 : index |
| %c2 = arith.constant 2 : index |
| %read_size = arith.select %cond, %c4, %c2 : index |
| |
    // Updated induction variables (two more) for the device_async_copy
    // below; the pipeliner generates these repeatedly.
| // CHECK: %[[I_PLUS_8_2:.+]] = arith.addi %[[I]], %[[C8]] |
| // CHECK: %[[I_PLUS_8_3:.+]] = arith.addi %[[I]], %[[C8]] |
| |
| // The second "select" is generated by predication and selects 0 for |
| // the two last iterations. |
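    // For example, with %i in {0, 4, ..., 96}, the last two iterations
    // (%i = 92 and %i = 96) fail the %i < 90 check, so the copy size
    // becomes 0.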
| // CHECK: %[[SELECTED1:.+]] = arith.select %[[CMP0]], %[[SELECTED0]], %[[C0]] |
| // CHECK: %[[ASYNC_TOKEN:.+]] = nvgpu.device_async_copy %[[GLOBAL]][%[[I_PLUS_8_3]]], %[[SHARED]][%[[I_PLUS_8_2]]], 4, %[[SELECTED1]] |
| %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size |
| : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>> |
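    // The trailing %read_size operand bounds how many elements are read
    // from the source; the rest of the 4-element destination line is
    // zero-filled (cp.async "zfill"), which is what makes the copy
    // predicable by setting its size to 0.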
| |
| nvgpu.device_async_wait %token |
| |
| // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]] |
| } |
  // There is no need to wait for the last copies as they were fully
  // predicated out and do not load the original data.
| // CHECK-NOT: nvgpu.device_async_wait |
| return |
| } |
| |
| !t = !transform.any_op |
| |
| module attributes {transform.with_named_sequence} { |
| transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) { |
| %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t |
| transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t |
| transform.yield |
| } |
| } |
| |
| // ----- |
| |
| // CHECK-LABEL: @async_depth_2_peeled |
| func.func @async_depth_2_peeled(%global: memref<?xf32>) { |
| %c0 = arith.constant 0 : index |
| %c98 = arith.constant 98 : index |
| %c100 = arith.constant 100 : index |
| %c4 = arith.constant 4 : index |
| %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>> |
| %c0f = arith.constant 0.0 : f32 |
| // CHECK: nvgpu.device_async_copy |
| // CHECK: nvgpu.device_async_copy |
| // CHECK: scf.for |
| // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 |
| // CHECK: arith.select |
| // CHECK: nvgpu.device_async_copy |
| // CHECK: scf.yield |
| // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 |
| // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0 |
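  // The two trailing waits drain the in-flight copies: first down to at most
  // one outstanding group, then to none.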
| scf.for %i = %c0 to %c98 step %c4 { |
| %c96 = arith.constant 96 : index |
| %cond = arith.cmpi slt, %i, %c96 : index |
| %c2 = arith.constant 2 : index |
| %read_size = arith.select %cond, %c4, %c2 : index |
| %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size |
| : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>> |
| nvgpu.device_async_wait %token |
| } |
| return |
| } |
| |
| !t = !transform.any_op |
| |
| module attributes {transform.with_named_sequence} { |
| transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) { |
| %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t |
| transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t |
| transform.yield |
| } |
| } |