// RUN: mlir-opt %s -split-input-file --pass-pipeline='builtin.module(func.func(nvgpu-optimize-shared-memory))' | FileCheck %s
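// The pass permutes (swizzles) the column index of shared memory (address
// space 3) accesses by XOR-ing it with a few bits of the row index in order
// to reduce shared memory bank conflicts. Each case below checks the
// andi/shli (or shrui)/xori sequence the pass is expected to emit; the mask
// and shift amount depend on the row width in bytes.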
// CHECK: @optimize_128x32xf16_32x128xf16([[arg0:%.+]]: memref<{{.*}}>, [[ldRow:%.+]]: index, [[ldCol:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index)
func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>,
    %ldRow: index, %ldCol: index,
    %stRow: index, %stCol: index,
    %fragRow: index, %fragCol: index)
    -> (vector<4x2xf16>, vector<4x2xf16>) {
// CHECK: [[shm:%.+]] = memref.alloc
// CHECK: [[shmB:%.+]] = memref.alloc
%shm = memref.alloc() : memref<128x32xf16, 3>
%shmB = memref.alloc() : memref<32x128xf16, 3>
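// The 128x32xf16 tile has 64-byte rows; the expected permutation is
// col ^ ((row & 6) << 2), applied to the device_async_copy store indices and
// the ldmatrix load indices below.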
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[src_bits:%.+]] = arith.andi [[stRow]], [[c6]]
// CHECK: [[c2:%.+]] = arith.constant 2 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c2]]
// CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
    : memref<128x128xf16> to memref<128x32xf16, 3>
%1 = nvgpu.device_async_create_group %0
nvgpu.device_async_wait %1 { numGroups = 1 : i32}
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
// CHECK: [[c2:%.+]] = arith.constant 2 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragColPerm]]]
%mat = nvgpu.ldmatrix %shm[%fragRow, %fragCol] {numTiles = 4 : i32, transpose = false}
    : memref<128x32xf16, 3> -> vector<4x2xf16>
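// The 32x128xf16 tile has 256-byte rows, so more row bits take part in the
// swizzle: the expected permutation is col ^ ((row & 15) << 3).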
// CHECK: [[c15:%.+]] = arith.constant 15 : index
// CHECK: [[src_bits:%.+]] = arith.andi [[stRow]], [[c15]]
// CHECK: [[c3:%.+]] = arith.constant 3 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c3]]
// CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
%2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8
    : memref<128x128xf16> to memref<32x128xf16, 3>
%3 = nvgpu.device_async_create_group %2
nvgpu.device_async_wait %3 { numGroups = 1 : i32}
// CHECK: [[c15:%.+]] = arith.constant 15 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]]
// CHECK: [[c3:%.+]] = arith.constant 3 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c3]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: nvgpu.ldmatrix [[shmB]][[[fragRow]], [[fragColPerm]]]
%matB = nvgpu.ldmatrix %shmB[%fragRow, %fragCol] {numTiles = 4 : i32, transpose = false}
    : memref<32x128xf16, 3> -> vector<4x2xf16>
return %mat, %matB: vector<4x2xf16>, vector<4x2xf16>
}
// -----
// CHECK: @optimize_64x16xf32_16x64xf32([[arg0:%.+]]: memref<{{.*}}>, [[ldRow:%.+]]: index, [[ldCol:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index)
func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
    %ldRow: index, %ldCol: index,
    %stRow: index, %stCol: index,
    %fragRow: index, %fragCol: index)
    -> (vector<4x1xf32>, vector<4x1xf32>, f32, vector<4xf32>, f32) {
// CHECK: [[shm:%.+]] = memref.alloc
// CHECK: [[shmB:%.+]] = memref.alloc
%shm = memref.alloc() : memref<64x16xf32, 3>
%shmB = memref.alloc() : memref<16x64xf32, 3>
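// 64x16xf32 rows are also 64 bytes wide; the expected permutation is
// col ^ ((row & 6) << 1), i.e. the same row bits as the f16 case above but
// with a smaller shift because each element is twice as wide.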
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[src_bits:%.+]] = arith.andi [[stRow]], [[c6]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c1]]
// CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4
    : memref<128x128xf32> to memref<64x16xf32, 3>
%1 = nvgpu.device_async_create_group %0
nvgpu.device_async_wait %1 { numGroups = 1 : i32}
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c1]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragColPerm]]]
%mat = nvgpu.ldmatrix %shm[%fragRow, %fragCol] {numTiles = 4 : i32, transpose = false}
    : memref<64x16xf32, 3> -> vector<4x1xf32>
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c1]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: memref.load [[shm]][[[fragRow]], [[fragColPerm]]]
%elem = memref.load %shm[%fragRow, %fragCol] : memref<64x16xf32, 3>
// Verify that vector.load and vector.store receive the same column swizzle.
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c1]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: vector.load [[shm]][[[fragRow]], [[fragColPerm]]]
%elem2 = vector.load %shm[%fragRow, %fragCol] : memref<64x16xf32, 3>, vector<4xf32>
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c1]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: vector.store %{{.+}}, [[shm]][[[fragRow]], [[fragColPerm]]]
vector.store %elem2, %shm[%fragRow, %fragCol] : memref<64x16xf32, 3>, vector<4xf32>
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c1]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: memref.store %{{.+}}, [[shm]][[[fragRow]], [[fragColPerm]]]
memref.store %elem, %shm[%fragRow, %fragCol] : memref<64x16xf32, 3>
// Verify the 16x64xf32 (256 bytes per row) case.
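// The expected permutation here is col ^ ((row & 15) << 2).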
// CHECK: [[c15:%.+]] = arith.constant 15 : index
// CHECK: [[src_bits:%.+]] = arith.andi [[stRow]], [[c15]]
// CHECK: [[c2:%.+]] = arith.constant 2 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c2]]
// CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
%2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4
    : memref<128x128xf32> to memref<16x64xf32, 3>
%3 = nvgpu.device_async_create_group %2
nvgpu.device_async_wait %3 { numGroups = 1 : i32}
// CHECK: [[c15:%.+]] = arith.constant 15 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]]
// CHECK: [[c2:%.+]] = arith.constant 2 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: nvgpu.ldmatrix [[shmB]][[[fragRow]], [[fragColPerm]]]
%matB = nvgpu.ldmatrix %shmB[%fragRow, %fragCol] {numTiles = 4 : i32, transpose = false}
    : memref<16x64xf32, 3> -> vector<4x1xf32>
// CHECK: [[c15:%.+]] = arith.constant 15 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]]
// CHECK: [[c2:%.+]] = arith.constant 2 : index
// CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: memref.load [[shmB]][[[fragRow]], [[fragColPerm]]]
%elemB = memref.load %shmB[%fragRow, %fragCol] : memref<16x64xf32, 3>
return %mat, %matB, %elem, %elem2, %elemB: vector<4x1xf32>, vector<4x1xf32>, f32, vector<4xf32>, f32
}
// -----
// Small column edge cases
// CHECK: @small_column_size_f64([[arg0:%.+]]: memref<{{.*}}>, [[ldRow:%.+]]: index, [[ldCol:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index)
func.func @small_column_size_f64(%arg0: memref<32x32xf64>,
    %ldRow: index, %ldCol: index,
    %stRow: index, %stCol: index,
    %fragRow: index, %fragCol: index)
    -> f64 {
// CHECK: [[shm:%.+]] = memref.alloc
%shm = memref.alloc() : memref<32x4xf64, 3>
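// 32x4xf64 rows are only 32 bytes wide, so the selected row bit is shifted
// right rather than left: the expected permutation is col ^ ((row & 4) >> 1).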
// CHECK: [[c4:%.+]] = arith.constant 4 : index
// CHECK: [[src_bits:%.+]] = arith.andi [[stRow]], [[c4]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shrui [[src_bits]], [[c1]]
// CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2
    : memref<32x32xf64> to memref<32x4xf64, 3>
%1 = nvgpu.device_async_create_group %0
nvgpu.device_async_wait %1 { numGroups = 1 : i32}
// CHECK: [[c4:%.+]] = arith.constant 4 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c4]]
// CHECK: [[c1:%.+]] = arith.constant 1 : index
// CHECK: [[xorBits:%.+]] = arith.shrui [[srcBits]], [[c1]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol]], [[xorBits]]
// CHECK: memref.load [[shm]][[[fragRow]], [[fragColPerm]]]
%el = memref.load %shm[%fragRow, %fragCol] : memref<32x4xf64, 3>
return %el: f64
}
// CHECK: @too_small_column_size_f16([[arg0:%.+]]: memref<{{.*}}>, [[ldRow:%.+]]: index, [[ldCol:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index)
func.func @too_small_column_size_f16(%arg0: memref<128x128xf16>,
    %ldRow: index, %ldCol: index,
    %stRow: index, %stCol: index,
    %fragRow: index, %fragCol: index)
    -> vector<1x2xf16> {
// CHECK: [[shm:%.+]] = memref.alloc
%shm = memref.alloc() : memref<128x8xf16, 3>
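// 128x8xf16 rows are only 16 bytes wide, which is too narrow to swizzle, so
// the copy and ldmatrix below are expected to keep their original indices.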
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
    : memref<128x128xf16> to memref<128x8xf16, 3>
%1 = nvgpu.device_async_create_group %0
nvgpu.device_async_wait %1 { numGroups = 1 : i32}
// CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragCol]]]
%mat = nvgpu.ldmatrix %shm[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false}
    : memref<128x8xf16, 3> -> vector<1x2xf16>
return %mat: vector<1x2xf16>
}
// -----
// CHECK: @abort_if_subview([[arg0:%.+]]: memref<{{.*}}>, [[ldRow:%.+]]: index, [[ldCol:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index)
func.func @abort_if_subview(%arg0: memref<128x128xf16>,
    %ldRow: index, %ldCol: index,
    %stRow: index, %stCol: index,
    %fragRow: index, %fragCol: index)
    -> vector<1x2xf16> {
// CHECK: [[shm:%.+]] = memref.alloc
%shm = memref.alloc() : memref<128x32xf16, 3>
// CHECK: [[shmView:%.+]] = memref.subview
%shmView = memref.subview %shm[0, 0][64, 32][1, 1] : memref<128x32xf16, 3> to memref<64x32xf16, 3>
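// Because the allocation is also accessed through a memref.subview, the pass
// is expected to bail out and leave all indices unpermuted.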
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
    : memref<128x128xf16> to memref<128x32xf16, 3>
%1 = nvgpu.device_async_create_group %0
nvgpu.device_async_wait %1 { numGroups = 1 : i32}
// CHECK: nvgpu.ldmatrix [[shmView]][[[fragRow]], [[fragCol]]]
%mat = nvgpu.ldmatrix %shmView[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false}
    : memref<64x32xf16, 3> -> vector<1x2xf16>
return %mat: vector<1x2xf16>
}