test/Dialect/Affine/loop-fusion-inner.mlir - llvm-project/mlir - Git at Google

 // RUN: mlir-opt -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer fusion-maximal}))' %s | FileCheck %s

 // Test fusion of affine nests inside other region-holding ops (scf.for in the
 // test case below).

 // CHECK-LABEL: func @fusion_inner_simple
 func.func @fusion_inner_simple(%A : memref<10xf32>) {
   %cst = arith.constant 0.0 : f32

   affine.for %i = 0 to 100 {
     %B = memref.alloc() : memref<10xf32>
     %C = memref.alloc() : memref<10xf32>

     affine.for %j = 0 to 10 {
       %v = affine.load %A[%j] : memref<10xf32>
       affine.store %v, %B[%j] : memref<10xf32>
     }

     affine.for %j = 0 to 10 {
       %v = affine.load %B[%j] : memref<10xf32>
       affine.store %v, %C[%j] : memref<10xf32>
     }
   }

   // CHECK:      affine.for %{{.*}} = 0 to 100
   // CHECK-NEXT:   memref.alloc
   // CHECK-NEXT:   memref.alloc
   // CHECK-NEXT:   affine.for %{{.*}} = 0 to 10
   // CHECK-NOT:    affine.for

   return
 }

 // CHECK-LABEL: func @fusion_inner_simple_scf
 func.func @fusion_inner_simple_scf(%A : memref<10xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c100 = arith.constant 100 : index
   %cst = arith.constant 0.0 : f32

   scf.for %i = %c0 to %c100 step %c1 {
     %B = memref.alloc() : memref<10xf32>
     %C = memref.alloc() : memref<10xf32>

     affine.for %j = 0 to 10 {
       %v = affine.load %A[%j] : memref<10xf32>
       affine.store %v, %B[%j] : memref<10xf32>
     }

     affine.for %j = 0 to 10 {
       %v = affine.load %B[%j] : memref<10xf32>
       affine.store %v, %C[%j] : memref<10xf32>
     }
   }
   // CHECK:      scf.for
   // CHECK-NEXT:   memref.alloc
   // CHECK-NEXT:   memref.alloc
   // CHECK-NEXT:   affine.for %{{.*}} = 0 to 10
   // CHECK-NOT:    affine.for
   return
 }

 // CHECK-LABEL: func @fusion_inner_multiple_nests
 func.func @fusion_inner_multiple_nests() {
   %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<4x4xi8>
   %alloc_10 = memref.alloc() : memref<8x4xi32>
   affine.for %arg8 = 0 to 4 {
     %alloc_14 = memref.alloc() : memref<4xi8>
     %alloc_15 = memref.alloc() : memref<8x4xi8>
     affine.for %arg9 = 0 to 4 {
       %0 = affine.load %alloc_5[%arg9, %arg8] : memref<4x4xi8>
       affine.store %0, %alloc_14[%arg9] : memref<4xi8>
     }
     %alloc_16 = memref.alloc() : memref<4xi8>
     affine.for %arg9 = 0 to 4 {
       %0 = affine.load %alloc_14[%arg9] : memref<4xi8>
       affine.store %0, %alloc_16[%arg9] : memref<4xi8>
     }
     affine.for %arg9 = 0 to 2 {
       %0 = affine.load %alloc_15[%arg9 * 4, 0] : memref<8x4xi8>
       %1 = affine.load %alloc_16[0] : memref<4xi8>
       %2 = affine.load %alloc_10[%arg9 * 4, %arg8] : memref<8x4xi32>
       %3 = arith.muli %0, %1 : i8
       %4 = arith.extsi %3 : i8 to i32
       %5 = arith.addi %4, %2 : i32
       affine.store %5, %alloc_10[%arg9 * 4 + 3, %arg8] : memref<8x4xi32>
     }
     memref.dealloc %alloc_16 : memref<4xi8>
   }
   // CHECK:      affine.for %{{.*}} = 0 to 4 {
   // Everything inside fused into two nests (the second will be DCE'd).
   // CHECK-NEXT:   memref.alloc() : memref<4xi8>
   // CHECK-NEXT:   memref.alloc() : memref<1xi8>
   // CHECK-NEXT:   memref.alloc() : memref<1xi8>
   // CHECK-NEXT:   memref.alloc() : memref<8x4xi8>
   // CHECK-NEXT:   memref.alloc() : memref<4xi8>
   // CHECK-NEXT:   affine.for %{{.*}} = 0 to 2 {
   // CHECK:        }
   // CHECK:        affine.for %{{.*}} = 0 to 4 {
   // CHECK:        }
   // CHECK-NEXT:   memref.dealloc
   // CHECK-NEXT: }
   // CHECK-NEXT: return
   return
 }

 // CHECK-LABEL: func @fusion_inside_scf_while
 func.func @fusion_inside_scf_while(%A : memref<10xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c100 = arith.constant 100 : index
   %cst = arith.constant 0.0 : f32

   %0 = scf.while (%arg3 = %cst) : (f32) -> (f32) {
     %1 = arith.cmpf ult, %arg3, %cst : f32
     scf.condition(%1) %arg3 : f32
   } do {
   ^bb0(%arg5: f32):

     %B = memref.alloc() : memref<10xf32>
     %C = memref.alloc() : memref<10xf32>

     affine.for %j = 0 to 10 {
       %v = affine.load %A[%j] : memref<10xf32>
       affine.store %v, %B[%j] : memref<10xf32>
     }

     affine.for %j = 0 to 10 {
       %v = affine.load %B[%j] : memref<10xf32>
       affine.store %v, %C[%j] : memref<10xf32>
     }
     %1 = arith.mulf %arg5, %cst : f32
     scf.yield %1 : f32
   }
   // CHECK:      scf.while
   // CHECK:        affine.for %{{.*}} = 0 to 10
   // CHECK-NOT:    affine.for
   // CHECK:        scf.yield
   return
 }


 memref.global "private" constant @__constant_10x2xf32 : memref<10x2xf32> = dense<0.000000e+00>

 // CHECK-LABEL: func @fusion_inner_long
 func.func @fusion_inner_long(%arg0: memref<10x2xf32>, %arg1: memref<10x10xf32>, %arg2: memref<10x2xf32>, %s: index) {
   %c0 = arith.constant 0 : index
   %cst_0 = arith.constant 1.000000e-03 : f32
   %c9 = arith.constant 9 : index
   %c10_i32 = arith.constant 10 : i32
   %c1_i32 = arith.constant 1 : i32
   %c100_i32 = arith.constant 100 : i32
   %c0_i32 = arith.constant 0 : i32
   %0 = memref.get_global @__constant_10x2xf32 : memref<10x2xf32>
   %1 = scf.for %arg3 = %c0_i32 to %c100_i32 step %c1_i32 iter_args(%arg4 = %arg0) -> (memref<10x2xf32>)  : i32 {
     %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xi32>
     affine.for %arg5 = 0 to 10 {
       %3 = arith.index_cast %arg5 : index to i32
       affine.store %3, %alloc[%arg5] : memref<10xi32>
     }
     %2 = scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg6 = %0) -> (memref<10x2xf32>)  : i32 {
       %alloc_5 = memref.alloc() : memref<2xf32>
       affine.for %arg7 = 0 to 2 {
         %16 = affine.load %arg4[%s, %arg7] : memref<10x2xf32>
         affine.store %16, %alloc_5[%arg7] : memref<2xf32>
       }
       %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32>
       affine.for %arg7 = 0 to 2 {
         %16 = affine.load %alloc_5[%arg7] : memref<2xf32>
         affine.store %16, %alloc_6[0, %arg7] : memref<1x2xf32>
       }
       %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
       affine.for %arg7 = 0 to 10 {
         affine.for %arg8 = 0 to 2 {
           %16 = affine.load %alloc_6[0, %arg8] : memref<1x2xf32>
           affine.store %16, %alloc_7[%arg7, %arg8] : memref<10x2xf32>
         }
       }
       %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
       affine.for %arg7 = 0 to 10 {
         affine.for %arg8 = 0 to 2 {
           %16 = affine.load %alloc_7[%arg7, %arg8] : memref<10x2xf32>
           %17 = affine.load %arg4[%arg7, %arg8] : memref<10x2xf32>
           %18 = arith.subf %16, %17 : f32
           affine.store %18, %alloc_8[%arg7, %arg8] : memref<10x2xf32>
         }
       }
       scf.yield %alloc_8 : memref<10x2xf32>
       // CHECK:      scf.for
       // CHECK:        scf.for
       // CHECK:          affine.for %{{.*}} = 0 to 10
       // CHECK-NEXT:       affine.for %{{.*}} = 0 to 2
       // CHECK-NOT:      affine.for
       // CHECK:          scf.yield
     }
     %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
     affine.for %arg5 = 0 to 10 {
       affine.for %arg6 = 0 to 2 {
         affine.store %cst_0, %alloc_2[%arg5, %arg6] : memref<10x2xf32>
       }
     }
     %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
     affine.for %arg5 = 0 to 10 {
       affine.for %arg6 = 0 to 2 {
         %3 = affine.load %alloc_2[%arg5, %arg6] : memref<10x2xf32>
         %4 = affine.load %2[%arg5, %arg6] : memref<10x2xf32>
         %5 = arith.mulf %3, %4 : f32
         affine.store %5, %alloc_3[%arg5, %arg6] : memref<10x2xf32>
       }
     }
     scf.yield %alloc_3 : memref<10x2xf32>
     // The nests above will be fused as well.
     // CHECK:      affine.for %{{.*}} = 0 to 10
     // CHECK-NEXT:   affine.for %{{.*}} = 0 to 2
     // CHECK-NOT:  affine.for
     // CHECK:      scf.yield
   }
   affine.for %arg3 = 0 to 10 {
     affine.for %arg4 = 0 to 2 {
       %2 = affine.load %1[%arg3, %arg4] : memref<10x2xf32>
       affine.store %2, %arg2[%arg3, %arg4] : memref<10x2xf32>
     }
   }
   return
 }
	// RUN: mlir-opt -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer fusion-maximal}))' %s \| FileCheck %s

	// Test fusion of affine nests inside other region-holding ops (scf.for in the
	// test case below).

	// CHECK-LABEL: func @fusion_inner_simple
	func.func @fusion_inner_simple(%A : memref<10xf32>) {
	%cst = arith.constant 0.0 : f32

	affine.for %i = 0 to 100 {
	%B = memref.alloc() : memref<10xf32>
	%C = memref.alloc() : memref<10xf32>

	affine.for %j = 0 to 10 {
	%v = affine.load %A[%j] : memref<10xf32>
	affine.store %v, %B[%j] : memref<10xf32>
	}

	affine.for %j = 0 to 10 {
	%v = affine.load %B[%j] : memref<10xf32>
	affine.store %v, %C[%j] : memref<10xf32>
	}
	}

	// CHECK: affine.for %{{.*}} = 0 to 100
	// CHECK-NEXT: memref.alloc
	// CHECK-NEXT: memref.alloc
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 10
	// CHECK-NOT: affine.for

	return
	}

	// CHECK-LABEL: func @fusion_inner_simple_scf
	func.func @fusion_inner_simple_scf(%A : memref<10xf32>) {
	%c0 = arith.constant 0 : index
	%c1 = arith.constant 1 : index
	%c100 = arith.constant 100 : index
	%cst = arith.constant 0.0 : f32

	scf.for %i = %c0 to %c100 step %c1 {
	%B = memref.alloc() : memref<10xf32>
	%C = memref.alloc() : memref<10xf32>

	affine.for %j = 0 to 10 {
	%v = affine.load %A[%j] : memref<10xf32>
	affine.store %v, %B[%j] : memref<10xf32>
	}

	affine.for %j = 0 to 10 {
	%v = affine.load %B[%j] : memref<10xf32>
	affine.store %v, %C[%j] : memref<10xf32>
	}
	}
	// CHECK: scf.for
	// CHECK-NEXT: memref.alloc
	// CHECK-NEXT: memref.alloc
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 10
	// CHECK-NOT: affine.for
	return
	}

	// CHECK-LABEL: func @fusion_inner_multiple_nests
	func.func @fusion_inner_multiple_nests() {
	%alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<4x4xi8>
	%alloc_10 = memref.alloc() : memref<8x4xi32>
	affine.for %arg8 = 0 to 4 {
	%alloc_14 = memref.alloc() : memref<4xi8>
	%alloc_15 = memref.alloc() : memref<8x4xi8>
	affine.for %arg9 = 0 to 4 {
	%0 = affine.load %alloc_5[%arg9, %arg8] : memref<4x4xi8>
	affine.store %0, %alloc_14[%arg9] : memref<4xi8>
	}
	%alloc_16 = memref.alloc() : memref<4xi8>
	affine.for %arg9 = 0 to 4 {
	%0 = affine.load %alloc_14[%arg9] : memref<4xi8>
	affine.store %0, %alloc_16[%arg9] : memref<4xi8>
	}
	affine.for %arg9 = 0 to 2 {
	%0 = affine.load %alloc_15[%arg9 * 4, 0] : memref<8x4xi8>
	%1 = affine.load %alloc_16[0] : memref<4xi8>
	%2 = affine.load %alloc_10[%arg9 * 4, %arg8] : memref<8x4xi32>
	%3 = arith.muli %0, %1 : i8
	%4 = arith.extsi %3 : i8 to i32
	%5 = arith.addi %4, %2 : i32
	affine.store %5, %alloc_10[%arg9 * 4 + 3, %arg8] : memref<8x4xi32>
	}
	memref.dealloc %alloc_16 : memref<4xi8>
	}
	// CHECK: affine.for %{{.*}} = 0 to 4 {
	// Everything inside fused into two nests (the second will be DCE'd).
	// CHECK-NEXT: memref.alloc() : memref<4xi8>
	// CHECK-NEXT: memref.alloc() : memref<1xi8>
	// CHECK-NEXT: memref.alloc() : memref<1xi8>
	// CHECK-NEXT: memref.alloc() : memref<8x4xi8>
	// CHECK-NEXT: memref.alloc() : memref<4xi8>
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
	// CHECK: }
	// CHECK: affine.for %{{.*}} = 0 to 4 {
	// CHECK: }
	// CHECK-NEXT: memref.dealloc
	// CHECK-NEXT: }
	// CHECK-NEXT: return
	return
	}

	// CHECK-LABEL: func @fusion_inside_scf_while
	func.func @fusion_inside_scf_while(%A : memref<10xf32>) {
	%c0 = arith.constant 0 : index
	%c1 = arith.constant 1 : index
	%c100 = arith.constant 100 : index
	%cst = arith.constant 0.0 : f32

	%0 = scf.while (%arg3 = %cst) : (f32) -> (f32) {
	%1 = arith.cmpf ult, %arg3, %cst : f32
	scf.condition(%1) %arg3 : f32
	} do {
	^bb0(%arg5: f32):

	%B = memref.alloc() : memref<10xf32>
	%C = memref.alloc() : memref<10xf32>

	affine.for %j = 0 to 10 {
	%v = affine.load %A[%j] : memref<10xf32>
	affine.store %v, %B[%j] : memref<10xf32>
	}

	affine.for %j = 0 to 10 {
	%v = affine.load %B[%j] : memref<10xf32>
	affine.store %v, %C[%j] : memref<10xf32>
	}
	%1 = arith.mulf %arg5, %cst : f32
	scf.yield %1 : f32
	}
	// CHECK: scf.while
	// CHECK: affine.for %{{.*}} = 0 to 10
	// CHECK-NOT: affine.for
	// CHECK: scf.yield
	return
	}


	memref.global "private" constant @__constant_10x2xf32 : memref<10x2xf32> = dense<0.000000e+00>

	// CHECK-LABEL: func @fusion_inner_long
	func.func @fusion_inner_long(%arg0: memref<10x2xf32>, %arg1: memref<10x10xf32>, %arg2: memref<10x2xf32>, %s: index) {
	%c0 = arith.constant 0 : index
	%cst_0 = arith.constant 1.000000e-03 : f32
	%c9 = arith.constant 9 : index
	%c10_i32 = arith.constant 10 : i32
	%c1_i32 = arith.constant 1 : i32
	%c100_i32 = arith.constant 100 : i32
	%c0_i32 = arith.constant 0 : i32
	%0 = memref.get_global @__constant_10x2xf32 : memref<10x2xf32>
	%1 = scf.for %arg3 = %c0_i32 to %c100_i32 step %c1_i32 iter_args(%arg4 = %arg0) -> (memref<10x2xf32>) : i32 {
	%alloc = memref.alloc() {alignment = 64 : i64} : memref<10xi32>
	affine.for %arg5 = 0 to 10 {
	%3 = arith.index_cast %arg5 : index to i32
	affine.store %3, %alloc[%arg5] : memref<10xi32>
	}
	%2 = scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg6 = %0) -> (memref<10x2xf32>) : i32 {
	%alloc_5 = memref.alloc() : memref<2xf32>
	affine.for %arg7 = 0 to 2 {
	%16 = affine.load %arg4[%s, %arg7] : memref<10x2xf32>
	affine.store %16, %alloc_5[%arg7] : memref<2xf32>
	}
	%alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32>
	affine.for %arg7 = 0 to 2 {
	%16 = affine.load %alloc_5[%arg7] : memref<2xf32>
	affine.store %16, %alloc_6[0, %arg7] : memref<1x2xf32>
	}
	%alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
	affine.for %arg7 = 0 to 10 {
	affine.for %arg8 = 0 to 2 {
	%16 = affine.load %alloc_6[0, %arg8] : memref<1x2xf32>
	affine.store %16, %alloc_7[%arg7, %arg8] : memref<10x2xf32>
	}
	}
	%alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
	affine.for %arg7 = 0 to 10 {
	affine.for %arg8 = 0 to 2 {
	%16 = affine.load %alloc_7[%arg7, %arg8] : memref<10x2xf32>
	%17 = affine.load %arg4[%arg7, %arg8] : memref<10x2xf32>
	%18 = arith.subf %16, %17 : f32
	affine.store %18, %alloc_8[%arg7, %arg8] : memref<10x2xf32>
	}
	}
	scf.yield %alloc_8 : memref<10x2xf32>
	// CHECK: scf.for
	// CHECK: scf.for
	// CHECK: affine.for %{{.*}} = 0 to 10
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 2
	// CHECK-NOT: affine.for
	// CHECK: scf.yield
	}
	%alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
	affine.for %arg5 = 0 to 10 {
	affine.for %arg6 = 0 to 2 {
	affine.store %cst_0, %alloc_2[%arg5, %arg6] : memref<10x2xf32>
	}
	}
	%alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
	affine.for %arg5 = 0 to 10 {
	affine.for %arg6 = 0 to 2 {
	%3 = affine.load %alloc_2[%arg5, %arg6] : memref<10x2xf32>
	%4 = affine.load %2[%arg5, %arg6] : memref<10x2xf32>
	%5 = arith.mulf %3, %4 : f32
	affine.store %5, %alloc_3[%arg5, %arg6] : memref<10x2xf32>
	}
	}
	scf.yield %alloc_3 : memref<10x2xf32>
	// The nests above will be fused as well.
	// CHECK: affine.for %{{.*}} = 0 to 10
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 2
	// CHECK-NOT: affine.for
	// CHECK: scf.yield
	}
	affine.for %arg3 = 0 to 10 {
	affine.for %arg4 = 0 to 2 {
	%2 = affine.load %1[%arg3, %arg4] : memref<10x2xf32>
	affine.store %2, %arg2[%arg3, %arg4] : memref<10x2xf32>
	}
	}
	return
	}