test/Dialect/Affine/affine-data-copy.mlir - llvm-project/mlir - Git at Google

 // RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 skip-non-unit-stride-loops" | FileCheck %s
 // Small buffer size to trigger fine copies.
 // RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 fast-mem-capacity=1" | FileCheck --check-prefix=CHECK-SMALL %s

 // Test affine data copy with a memref filter. We use a test pass that invokes
 // affine data copy utility on the input loop nest.
 // '-test-affine-data-copy-memref-filter' passes the first memref found in an
 // affine.load op in the innermost loop as a filter.
 // RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER
 // RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION

 // -copy-skip-non-stride-loops forces the copies to be placed right inside the
 // tile space loops, avoiding the sensitivity of copy placement depth to memory
 // footprint -- so that one could write a definite test case and not have to
 // update it each time something related to the cost functions change.

 #id = affine_map<(d0) -> (d0)>
 #ub = affine_map<(d0) -> (d0 + 128)>

 // Map used to index the buffer while computing.
 // CHECK-DAG: [[$MAP_IDENTITY:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0)>
 // CHECK-DAG: [[$MAP_PLUS_128:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 + 128)>

 // CHECK-LABEL: func @matmul
 // FILTER-LABEL: func @matmul
 func.func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
   affine.for %i = 0 to 4096 step 128 {
     affine.for %j = 0 to 4096 step 128 {
       affine.for %k = 0 to 4096 step 128 {
         affine.for %ii = #id(%i) to #ub(%i) {
           affine.for %jj = #id(%j) to #ub(%j) {
             affine.for %kk = #id(%k) to #ub(%k) {
               %5 = affine.load %A[%ii, %kk] : memref<4096x4096xf32>
               %6 = affine.load %B[%kk, %jj] : memref<4096x4096xf32>
               %7 = affine.load %C[%ii, %jj] : memref<4096x4096xf32>
               %8 = arith.mulf %5, %6 : f32
               %9 = arith.addf %7, %8 : f32
               affine.store %9, %C[%ii, %jj] : memref<4096x4096xf32>
             }
           }
         }
       }
     }
   }
   return %C : memref<4096x4096xf32>
 }

 // Buffers of size 128x128 get created here for all three matrices.

 // CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 {
 // CHECK:   affine.for %[[J:.*]] = 0 to 4096 step 128 {
 // CHECK:     [[BUFC:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<128x128xf32>
 // The result matrix's copy gets hoisted out.
 // Result matrix copy-in.
 // CHECK:     affine.for %[[II:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:       affine.for %[[JJ:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:         affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
 // CHECK:         affine.store %{{.*}}, [[BUFC]][%[[II]] - %[[I]], %[[JJ]] - %[[J]]] : memref<128x128xf32>
 // CHECK:       }
 // CHECK:     }

 // LHS matrix copy-in.
 // CHECK:     affine.for %[[K:.*]] = 0 to 4096 step 128 {
 // CHECK:      [[BUFA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<128x128xf32>
 // CHECK:       affine.for %[[II:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:         affine.for %[[KK:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:           affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
 // CHECK:           affine.store %{{.*}}, [[BUFA]][%[[II]] - %[[I]], %[[KK]] - %[[K]]] : memref<128x128xf32>
 // CHECK:         }
 // CHECK:       }

 // RHS matrix copy-in.
 // CHECK:       [[BUFB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<128x128xf32>
 // CHECK:       affine.for %[[KK:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:         affine.for %[[JJ:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:           affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
 // CHECK:           affine.store %{{.*}}, [[BUFB]][%[[KK]] - %[[K]], %[[JJ]] - %[[J]]] : memref<128x128xf32>
 // CHECK:         }
 // CHECK:       }

 // Computation on the fast buffers.
 // CHECK:       affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:         affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:           affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:             affine.load [[BUFA]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
 // CHECK:             affine.load [[BUFB]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
 // CHECK:             affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
 // CHECK:             arith.mulf %{{.*}}, %{{.*}} : f32
 // CHECK:             arith.addf %{{.*}}, %{{.*}} : f32
 // CHECK:             affine.store %{{.*}}, [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
 // CHECK:           }
 // CHECK:         }
 // CHECK:       }
 // CHECK:       memref.dealloc [[BUFB]] : memref<128x128xf32>
 // CHECK:       memref.dealloc [[BUFA]] : memref<128x128xf32>
 // CHECK:     }

 // Result matrix copy out.
 // CHECK:     affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:       affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
 // CHECK:         affine.load [[BUFC]][%{{.*}} - %{{.*}}, %{{.*}} - %{{.*}}] : memref<128x128xf32>
 // CHECK:         store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
 // CHECK:       }
 // CHECK:     }
 // CHECK:     memref.dealloc [[BUFC]] : memref<128x128xf32>
 // CHECK:   }
 // CHECK: }

 // Check that only one memref is copied when memref filter is used.

 //      FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
 //      FILTER:   memref.alloc() : memref<128x4096xf32>
 //  FILTER-NOT:   memref.alloc()
 //      FILTER:   affine.for
 //      FILTER:     affine.for %{{.*}} = 0 to 4096 {
 //      FILTER:   affine.for %{{.*}} = 0 to 4096 step 128 {
 // FILTER-NEXT:     affine.for %{{.*}} = 0 to 4096 step 128 {
 // FILTER-NEXT:       affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
 // FILTER-NEXT:         affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
 // FILTER-NEXT:           affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
 //      FILTER:   memref.dealloc %{{.*}} : memref<128x4096xf32>
 //  FILTER-NOT:   memref.dealloc %{{.*}} : memref<128x4096xf32>

 // -----

 //
 // This test case will lead to single element buffers. These are eventually
 // expected to be turned into registers via alloca and mem2reg.
 //
 // CHECK-SMALL-LABEL: func @single_elt_buffers
 // FILTER-LABEL: func @single_elt_buffers
 // MEMREF_REGION-LABEL: func @single_elt_buffers
 func.func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
   affine.for %i = 0 to 1024 {
     affine.for %j = 0 to 1024 {
       affine.for %k = 0 to 1024 {
         %6 = affine.load %arg1[%k, %j] : memref<1024x1024xf32>
         %7 = affine.load %arg2[%i, %j] : memref<1024x1024xf32>
         %9 = arith.addf %6, %7 : f32
         affine.store %9, %arg2[%i, %j] : memref<1024x1024xf32>
       }
     }
   }
   return %arg2 : memref<1024x1024xf32>
 }
 // CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
 // CHECK-SMALL:   affine.for %arg{{.*}} = 0 to 1024 {
 // CHECK-SMALL:     memref.alloc() : memref<1x1xf32>
 // CHECK-SMALL:     affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
 // CHECK-SMALL:     affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
 // CHECK-SMALL:     affine.for %arg{{.*}} = 0 to 1024 {
 // CHECK-SMALL:       memref.alloc() : memref<1x1xf32>
 // CHECK-SMALL:       affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
 // CHECK-SMALL:       affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
 // CHECK-SMALL:       affine.load %{{.*}}[0, 0] : memref<1x1xf32>
 // CHECK-SMALL:       affine.load %{{.*}}[0, 0] : memref<1x1xf32>
 // CHECK-SMALL:       arith.addf %{{.*}}, %{{.*}} : f32
 // CHECK-SMALL:       affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
 // CHECK-SMALL:       memref.dealloc %{{.*}} : memref<1x1xf32>
 // CHECK-SMALL:     }
 // CHECK-SMALL:     affine.load %{{.*}}[0, 0] : memref<1x1xf32>
 // CHECK-SMALL:     affine.store %{{.*}}, %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
 // CHECK-SMALL:     memref.dealloc %{{.*}} : memref<1x1xf32>
 // CHECK-SMALL:   }
 // CHECK-SMALL: }
 // CHECK-SMALL: return

 // Check that only one memref is copied when memref filter is used.

 //      FILTER: memref.alloc() : memref<1024x1024xf32>
 //  FILTER-NOT: memref.alloc()
 //      FILTER: affine.for %{{.*}} = 0 to 1024 {
 //      FILTER:   affine.for %{{.*}} = 0 to 1024 {
 //      FILTER: affine.for %{{.*}} = 0 to 1024 {
 // FILTER-NEXT:   affine.for %{{.*}} = 0 to 1024 {
 // FILTER-NEXT:     affine.for %{{.*}} = 0 to 1024 {
 //      FILTER: memref.dealloc %{{.*}} : memref<1024x1024xf32>
 //  FILTER-NOT: memref.dealloc
 //  FILTER:     return

 // CHeck that only one memref is copied, because for-memref-region is enabled
 // (and the first ever encountered load is analyzed).
 //      MEMREF_REGION: memref.alloc() : memref<1024x1024xf32>
 //  MEMREF_REGION-NOT: memref.alloc()
 //      MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
 //      MEMREF_REGION:   affine.for %{{.*}} = 0 to 1024 {
 //      MEMREF_REGION:   }
 //      MEMREF_REGION: }
 // MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
 // MEMREF_REGION-NEXT:   affine.for %{{.*}} = 0 to 1024 {
 // MEMREF_REGION-NEXT:     affine.for %{{.*}} = 0 to 1024 {
 //      MEMREF_REGION: memref.dealloc %{{.*}} : memref<1024x1024xf32>
 // MEMREF_REGION-NOT: memref.dealloc
 // MEMREF_REGION-NEXT: return

 // -----

 // This pattern typically appears with tiling with tile sizes that don't divide
 // the loop trip counts.

 #map_ub = affine_map<(d0) -> (4096, d0 + 100)>

 // CHECK-DAG: [[$MAP_IDENTITY:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0)>
 // CHECK-DAG: [[$MAP_MIN_UB1:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 + 100, 4096)>
 // CHECK-DAG: [[$MAP_MIN_UB2:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (4096, d0 + 100)>

 // CHECK-LABEL: func @min_upper_bound
 func.func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
   affine.for %i = 0 to 4096 step 100 {
     affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) {
       %5 = affine.load %A[%ii] : memref<4096xf32>
       %6 = arith.mulf %5, %5 : f32
       affine.store %6, %A[%ii] : memref<4096xf32>
     }
   }
   return %A : memref<4096xf32>
 }
 // CHECK:      affine.for %[[IV1:.*]] = 0 to 4096 step 100
 // CHECK:        %[[BUF:.*]] = memref.alloc() : memref<100xf32>
 // CHECK-NEXT:   affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
 // CHECK-NEXT:     affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32>
 // CHECK-NEXT:     affine.store %{{.*}}, %[[BUF]][%[[IV2]] - %[[IV1]]] : memref<100xf32>
 // CHECK-NEXT:   }
 // CHECK-NEXT:   affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB2]](%[[IV1]]) {
 // CHECK-NEXT:     affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
 // CHECK-NEXT:     arith.mulf
 // CHECK-NEXT:     affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
 // CHECK-NEXT:   }
 // CHECK:        affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
 // CHECK-NEXT:     affine.load %[[BUF]][%[[IV2]] - %[[IV1]]] : memref<100xf32>
 // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%[[IV2]]] : memref<4096xf32>
 // CHECK-NEXT:   }
 // CHECK-NEXT:   memref.dealloc %[[BUF]] : memref<100xf32>
 // CHECK-NEXT: }

 // -----

 // Lower bound is a max; upper bound is a min. This pattern typically appears
 // with multi-level tiling when the tile sizes used don't divide loop trip
 // counts.

 #lb = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
 #ub = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>

 // CHECK-DAG: #[[$LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
 // CHECK-DAG: #[[$UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>

 // CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>,
 // CHECK-SAME: [[i:arg[0-9a-zA-Z_]+]]
 // CHECK-SAME: [[j:arg[0-9a-zA-Z_]+]]
 func.func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) {
   affine.for %ii = 0 to 2048 {
     affine.for %jj = max #lb()[%i, %j] to min #ub()[%i, %j] {
       affine.load %M[%ii, %jj] : memref<2048x516xf64>
     }
   }
   return
 }

 // CHECK:      %[[BUF:.*]] = memref.alloc() : memref<2048x6xf64>
 // CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 {
 // CHECK-NEXT:   affine.for %[[jj:.*]] = max #[[$LB]]()[%[[i]], %[[j]]] to min #[[$UB]]()[%[[i]], %[[j]]] {
 // CHECK-NEXT:      affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64>
 // CHECK-NEXT:      affine.store %{{.*}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
 // CHECK-NEXT:   }
 // CHECK-NEXT: }
 // CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 {
 // CHECK-NEXT:   affine.for %[[jj_:.*]] = max #[[$LB]]()[%{{.*}}, %{{.*}}] to min #[[$UB]]()[%{{.*}}, %{{.*}}] {
 // CHECK-NEXT:     affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
 // CHECK-NEXT:    }
 // CHECK-NEXT: }
 // CHECK-NEXT: memref.dealloc %[[BUF]] : memref<2048x6xf64>

 // -----

 // CHECK-LABEL: func @empty_loops
 func.func @empty_loops(%arg0: memref<1024x1024xf64>) {
   // Empty loops - so no copy generation happens.
   affine.for %i = 0 to 0 {
     affine.load %arg0[0, %i] : memref<1024x1024xf64>
   }
   affine.for %i = 0 to -16 {
     affine.load %arg0[0, %i] : memref<1024x1024xf64>
   }
   return
   // CHECK-NOT:    memref.alloc
   // CHECK:        return
 }

 #map16 = affine_map<(d0, d1, d2) -> (d0 * 40 + d1 * 8 + d2 * 2)>
 #map17 = affine_map<(d0, d1, d2) -> (d0 * 40 + d1 * 8 + d2 * 2 + 2)>
 // CHECK-LABEL: func @affine_parallel
 func.func @affine_parallel(%85:memref<2x5x4x2xi64>) {
   affine.for %arg0 = 0 to 2 {
     affine.parallel (%arg1) = (0) to (5) {
       affine.parallel (%arg2) = (0) to (4) {
         affine.for %arg3 = #map16(%arg0, %arg1, %arg2) to #map17(%arg0, %arg1, %arg2) {
           %105 = affine.load %85[((%arg3 floordiv 2) floordiv 4) floordiv 5, ((%arg3 floordiv 2) floordiv 4) mod 5, (%arg3 floordiv 2) mod 4, %arg3 mod 2] : memref<2x5x4x2xi64>
         }
       }
     }
   }
   // CHECK:     affine.for
   // CHECK-NEXT:  affine.for %{{.*}} = 0 to 5
   // CHECK-NEXT:    affine.for %{{.*}} = 0 to 4
   // CHECK-NEXT:      affine.for %{{.*}} = 0 to 2

   // CHECK:     affine.for
   // CHECK-NEXT:  affine.parallel
   // CHECK-NEXT:    affine.parallel
   return
 }

 // CHECK-LABEL: func @index_elt_type
 func.func @index_elt_type(%arg0: memref<1x2x4x8xindex>) {
   affine.for %arg1 = 0 to 1 {
     affine.for %arg2 = 0 to 2 {
       affine.for %arg3 = 0 to 4 {
         affine.for %arg4 = 0 to 8 {
           affine.store %arg4, %arg0[%arg1, %arg2, %arg3, %arg4] : memref<1x2x4x8xindex>
         }
       }
     }
   }

   // CHECK:     affine.for %{{.*}} = 0 to 1
   // CHECK-NEXT:  affine.for %{{.*}} = 0 to 2
   // CHECK-NEXT:    affine.for %{{.*}} = 0 to 4
   // CHECK-NEXT:      affine.for %{{.*}} = 0 to 8

   // CHECK:     affine.for %{{.*}} = 0 to 2
   // CHECK-NEXT:  affine.for %{{.*}} = 0 to 4
   // CHECK-NEXT:    affine.for %{{.*}} = 0 to 8
   return
 }
	// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 skip-non-unit-stride-loops" \| FileCheck %s
	// Small buffer size to trigger fine copies.
	// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 fast-mem-capacity=1" \| FileCheck --check-prefix=CHECK-SMALL %s

	// Test affine data copy with a memref filter. We use a test pass that invokes
	// affine data copy utility on the input loop nest.
	// '-test-affine-data-copy-memref-filter' passes the first memref found in an
	// affine.load op in the innermost loop as a filter.
	// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' \| FileCheck %s --check-prefix=FILTER
	// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' \| FileCheck %s --check-prefix=MEMREF_REGION

	// -copy-skip-non-stride-loops forces the copies to be placed right inside the
	// tile space loops, avoiding the sensitivity of copy placement depth to memory
	// footprint -- so that one could write a definite test case and not have to
	// update it each time something related to the cost functions change.

	#id = affine_map<(d0) -> (d0)>
	#ub = affine_map<(d0) -> (d0 + 128)>

	// Map used to index the buffer while computing.
	// CHECK-DAG: [[$MAP_IDENTITY:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0)>
	// CHECK-DAG: [[$MAP_PLUS_128:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 + 128)>

	// CHECK-LABEL: func @matmul
	// FILTER-LABEL: func @matmul
	func.func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
	affine.for %i = 0 to 4096 step 128 {
	affine.for %j = 0 to 4096 step 128 {
	affine.for %k = 0 to 4096 step 128 {
	affine.for %ii = #id(%i) to #ub(%i) {
	affine.for %jj = #id(%j) to #ub(%j) {
	affine.for %kk = #id(%k) to #ub(%k) {
	%5 = affine.load %A[%ii, %kk] : memref<4096x4096xf32>
	%6 = affine.load %B[%kk, %jj] : memref<4096x4096xf32>
	%7 = affine.load %C[%ii, %jj] : memref<4096x4096xf32>
	%8 = arith.mulf %5, %6 : f32
	%9 = arith.addf %7, %8 : f32
	affine.store %9, %C[%ii, %jj] : memref<4096x4096xf32>
	}
	}
	}
	}
	}
	}
	return %C : memref<4096x4096xf32>
	}

	// Buffers of size 128x128 get created here for all three matrices.

	// CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 {
	// CHECK: affine.for %[[J:.*]] = 0 to 4096 step 128 {
	// CHECK: [[BUFC:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<128x128xf32>
	// The result matrix's copy gets hoisted out.
	// Result matrix copy-in.
	// CHECK: affine.for %[[II:.]] = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.for %[[JJ:.]] = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.load %{{.}}[%{{.}}, %{{.*}}] : memref<4096x4096xf32>
	// CHECK: affine.store %{{.*}}, [[BUFC]][%[[II]] - %[[I]], %[[JJ]] - %[[J]]] : memref<128x128xf32>
	// CHECK: }
	// CHECK: }

	// LHS matrix copy-in.
	// CHECK: affine.for %[[K:.*]] = 0 to 4096 step 128 {
	// CHECK: [[BUFA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<128x128xf32>
	// CHECK: affine.for %[[II:.]] = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.for %[[KK:.]] = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.load %{{.}}[%{{.}}, %{{.*}}] : memref<4096x4096xf32>
	// CHECK: affine.store %{{.*}}, [[BUFA]][%[[II]] - %[[I]], %[[KK]] - %[[K]]] : memref<128x128xf32>
	// CHECK: }
	// CHECK: }

	// RHS matrix copy-in.
	// CHECK: [[BUFB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<128x128xf32>
	// CHECK: affine.for %[[KK:.]] = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.for %[[JJ:.]] = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.load %{{.}}[%{{.}}, %{{.*}}] : memref<4096x4096xf32>
	// CHECK: affine.store %{{.*}}, [[BUFB]][%[[KK]] - %[[K]], %[[JJ]] - %[[J]]] : memref<128x128xf32>
	// CHECK: }
	// CHECK: }

	// Computation on the fast buffers.
	// CHECK: affine.for %{{.}} = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.for %{{.}} = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.for %{{.}} = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.load [[BUFA]][-%{{.}} + %{{.}}, -%{{.}} + %{{.}}] : memref<128x128xf32>
	// CHECK: affine.load [[BUFB]][-%{{.}} + %{{.}}, -%{{.}} + %{{.}}] : memref<128x128xf32>
	// CHECK: affine.load [[BUFC]][-%{{.}} + %{{.}}, -%{{.}} + %{{.}}] : memref<128x128xf32>
	// CHECK: arith.mulf %{{.}}, %{{.}} : f32
	// CHECK: arith.addf %{{.}}, %{{.}} : f32
	// CHECK: affine.store %{{.}}, [[BUFC]][-%{{.}} + %{{.}}, -%{{.}} + %{{.*}}] : memref<128x128xf32>
	// CHECK: }
	// CHECK: }
	// CHECK: }
	// CHECK: memref.dealloc [[BUFB]] : memref<128x128xf32>
	// CHECK: memref.dealloc [[BUFA]] : memref<128x128xf32>
	// CHECK: }

	// Result matrix copy out.
	// CHECK: affine.for %{{.}} = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.for %{{.}} = #[[$MAP_IDENTITY]](%{{.}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
	// CHECK: affine.load [[BUFC]][%{{.}} - %{{.}}, %{{.}} - %{{.}}] : memref<128x128xf32>
	// CHECK: store %{{.}}, %{{.}}[%{{.}}, %{{.}}] : memref<4096x4096xf32>
	// CHECK: }
	// CHECK: }
	// CHECK: memref.dealloc [[BUFC]] : memref<128x128xf32>
	// CHECK: }
	// CHECK: }

	// Check that only one memref is copied when memref filter is used.

	// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
	// FILTER: memref.alloc() : memref<128x4096xf32>
	// FILTER-NOT: memref.alloc()
	// FILTER: affine.for
	// FILTER: affine.for %{{.*}} = 0 to 4096 {
	// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
	// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 {
	// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {
	// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {
	// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {
	// FILTER: memref.dealloc %{{.*}} : memref<128x4096xf32>
	// FILTER-NOT: memref.dealloc %{{.*}} : memref<128x4096xf32>

	// -----

	//
	// This test case will lead to single element buffers. These are eventually
	// expected to be turned into registers via alloca and mem2reg.
	//
	// CHECK-SMALL-LABEL: func @single_elt_buffers
	// FILTER-LABEL: func @single_elt_buffers
	// MEMREF_REGION-LABEL: func @single_elt_buffers
	func.func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
	affine.for %i = 0 to 1024 {
	affine.for %j = 0 to 1024 {
	affine.for %k = 0 to 1024 {
	%6 = affine.load %arg1[%k, %j] : memref<1024x1024xf32>
	%7 = affine.load %arg2[%i, %j] : memref<1024x1024xf32>
	%9 = arith.addf %6, %7 : f32
	affine.store %9, %arg2[%i, %j] : memref<1024x1024xf32>
	}
	}
	}
	return %arg2 : memref<1024x1024xf32>
	}
	// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
	// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
	// CHECK-SMALL: memref.alloc() : memref<1x1xf32>
	// CHECK-SMALL: affine.load %arg{{.}}[%{{.}}, %{{.*}}] : memref<1024x1024xf32>
	// CHECK-SMALL: affine.store %{{.}}, %{{.}}[0, 0] : memref<1x1xf32>
	// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
	// CHECK-SMALL: memref.alloc() : memref<1x1xf32>
	// CHECK-SMALL: affine.load %arg{{.}}[%{{.}}, %{{.*}}] : memref<1024x1024xf32>
	// CHECK-SMALL: affine.store %{{.}}, %{{.}}[0, 0] : memref<1x1xf32>
	// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
	// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
	// CHECK-SMALL: arith.addf %{{.}}, %{{.}} : f32
	// CHECK-SMALL: affine.store %{{.}}, %{{.}}[0, 0] : memref<1x1xf32>
	// CHECK-SMALL: memref.dealloc %{{.*}} : memref<1x1xf32>
	// CHECK-SMALL: }
	// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
	// CHECK-SMALL: affine.store %{{.}}, %arg{{.}}[%{{.}}, %{{.}}] : memref<1024x1024xf32>
	// CHECK-SMALL: memref.dealloc %{{.*}} : memref<1x1xf32>
	// CHECK-SMALL: }
	// CHECK-SMALL: }
	// CHECK-SMALL: return

	// Check that only one memref is copied when memref filter is used.

	// FILTER: memref.alloc() : memref<1024x1024xf32>
	// FILTER-NOT: memref.alloc()
	// FILTER: affine.for %{{.*}} = 0 to 1024 {
	// FILTER: affine.for %{{.*}} = 0 to 1024 {
	// FILTER: affine.for %{{.*}} = 0 to 1024 {
	// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
	// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
	// FILTER: memref.dealloc %{{.*}} : memref<1024x1024xf32>
	// FILTER-NOT: memref.dealloc
	// FILTER: return

	// CHeck that only one memref is copied, because for-memref-region is enabled
	// (and the first ever encountered load is analyzed).
	// MEMREF_REGION: memref.alloc() : memref<1024x1024xf32>
	// MEMREF_REGION-NOT: memref.alloc()
	// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
	// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
	// MEMREF_REGION: }
	// MEMREF_REGION: }
	// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
	// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
	// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
	// MEMREF_REGION: memref.dealloc %{{.*}} : memref<1024x1024xf32>
	// MEMREF_REGION-NOT: memref.dealloc
	// MEMREF_REGION-NEXT: return

	// -----

	// This pattern typically appears with tiling with tile sizes that don't divide
	// the loop trip counts.

	#map_ub = affine_map<(d0) -> (4096, d0 + 100)>

	// CHECK-DAG: [[$MAP_IDENTITY:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0)>
	// CHECK-DAG: [[$MAP_MIN_UB1:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 + 100, 4096)>
	// CHECK-DAG: [[$MAP_MIN_UB2:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (4096, d0 + 100)>

	// CHECK-LABEL: func @min_upper_bound
	func.func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
	affine.for %i = 0 to 4096 step 100 {
	affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) {
	%5 = affine.load %A[%ii] : memref<4096xf32>
	%6 = arith.mulf %5, %5 : f32
	affine.store %6, %A[%ii] : memref<4096xf32>
	}
	}
	return %A : memref<4096xf32>
	}
	// CHECK: affine.for %[[IV1:.*]] = 0 to 4096 step 100
	// CHECK: %[[BUF:.*]] = memref.alloc() : memref<100xf32>
	// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
	// CHECK-NEXT: affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32>
	// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][%[[IV2]] - %[[IV1]]] : memref<100xf32>
	// CHECK-NEXT: }
	// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB2]](%[[IV1]]) {
	// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
	// CHECK-NEXT: arith.mulf
	// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
	// CHECK-NEXT: }
	// CHECK: affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
	// CHECK-NEXT: affine.load %[[BUF]][%[[IV2]] - %[[IV1]]] : memref<100xf32>
	// CHECK-NEXT: affine.store %{{.}}, %{{.}}[%[[IV2]]] : memref<4096xf32>
	// CHECK-NEXT: }
	// CHECK-NEXT: memref.dealloc %[[BUF]] : memref<100xf32>
	// CHECK-NEXT: }

	// -----

	// Lower bound is a max; upper bound is a min. This pattern typically appears
	// with multi-level tiling when the tile sizes used don't divide loop trip
	// counts.

	#lb = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
	#ub = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>

	// CHECK-DAG: #[[$LB:.]] = affine_map<()[s0, s1] -> (s0 512, s1 * 6)>
	// CHECK-DAG: #[[$UB:.]] = affine_map<()[s0, s1] -> (s0 512 + 512, s1 * 6 + 6)>

	// CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>,
	// CHECK-SAME: [[i:arg[0-9a-zA-Z_]+]]
	// CHECK-SAME: [[j:arg[0-9a-zA-Z_]+]]
	func.func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) {
	affine.for %ii = 0 to 2048 {
	affine.for %jj = max #lb()[%i, %j] to min #ub()[%i, %j] {
	affine.load %M[%ii, %jj] : memref<2048x516xf64>
	}
	}
	return
	}

	// CHECK: %[[BUF:.*]] = memref.alloc() : memref<2048x6xf64>
	// CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 {
	// CHECK-NEXT: affine.for %[[jj:.*]] = max #[[$LB]]()[%[[i]], %[[j]]] to min #[[$UB]]()[%[[i]], %[[j]]] {
	// CHECK-NEXT: affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64>
	// CHECK-NEXT: affine.store %{{.}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) 6] : memref<2048x6xf64>
	// CHECK-NEXT: }
	// CHECK-NEXT: }
	// CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 {
	// CHECK-NEXT: affine.for %[[jj_:.]] = max #[[$LB]]()[%{{.}}, %{{.}}] to min #[[$UB]]()[%{{.}}, %{{.*}}] {
	// CHECK-NEXT: affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
	// CHECK-NEXT: }
	// CHECK-NEXT: }
	// CHECK-NEXT: memref.dealloc %[[BUF]] : memref<2048x6xf64>

	// -----

	// CHECK-LABEL: func @empty_loops
	func.func @empty_loops(%arg0: memref<1024x1024xf64>) {
	// Empty loops - so no copy generation happens.
	affine.for %i = 0 to 0 {
	affine.load %arg0[0, %i] : memref<1024x1024xf64>
	}
	affine.for %i = 0 to -16 {
	affine.load %arg0[0, %i] : memref<1024x1024xf64>
	}
	return
	// CHECK-NOT: memref.alloc
	// CHECK: return
	}

	#map16 = affine_map<(d0, d1, d2) -> (d0 * 40 + d1 * 8 + d2 * 2)>
	#map17 = affine_map<(d0, d1, d2) -> (d0 * 40 + d1 * 8 + d2 * 2 + 2)>
	// CHECK-LABEL: func @affine_parallel
	func.func @affine_parallel(%85:memref<2x5x4x2xi64>) {
	affine.for %arg0 = 0 to 2 {
	affine.parallel (%arg1) = (0) to (5) {
	affine.parallel (%arg2) = (0) to (4) {
	affine.for %arg3 = #map16(%arg0, %arg1, %arg2) to #map17(%arg0, %arg1, %arg2) {
	%105 = affine.load %85[((%arg3 floordiv 2) floordiv 4) floordiv 5, ((%arg3 floordiv 2) floordiv 4) mod 5, (%arg3 floordiv 2) mod 4, %arg3 mod 2] : memref<2x5x4x2xi64>
	}
	}
	}
	}
	// CHECK: affine.for
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 5
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 4
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 2

	// CHECK: affine.for
	// CHECK-NEXT: affine.parallel
	// CHECK-NEXT: affine.parallel
	return
	}

	// CHECK-LABEL: func @index_elt_type
	func.func @index_elt_type(%arg0: memref<1x2x4x8xindex>) {
	affine.for %arg1 = 0 to 1 {
	affine.for %arg2 = 0 to 2 {
	affine.for %arg3 = 0 to 4 {
	affine.for %arg4 = 0 to 8 {
	affine.store %arg4, %arg0[%arg1, %arg2, %arg3, %arg4] : memref<1x2x4x8xindex>
	}
	}
	}
	}

	// CHECK: affine.for %{{.*}} = 0 to 1
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 2
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 4
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 8

	// CHECK: affine.for %{{.*}} = 0 to 2
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 4
	// CHECK-NEXT: affine.for %{{.*}} = 0 to 8
	return
	}