[mlir][tensor] Fix off-by-one error in ReshapeOpsUtils (#112774)

This patch fixes an off-by-one error in
`mlir::getReassociationIndicesForCollapse()` that occurs when the last
two dims of the source tensor satisfy the while loop.

This would cause an assertion failure due to out-of-bounds-access, which
is now fixed.
diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index d2ab4ca..70b2aaf 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -47,7 +47,7 @@
       break;
 
     int64_t currTargetShape = targetShape[targetDim];
-    while (sourceDim < sourceShape.size() &&
+    while (sourceDim < (sourceShape.size() - 1) &&
            sourceShape[sourceDim] != ShapedType::kDynamic &&
            prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape) {
       prodOfCollapsedDims *= sourceShape[sourceDim];
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 0aa2d33..dbf0f0b 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1251,6 +1251,29 @@
 
 // -----
 
+func.func @compose_expand_of_collapse_last_two_dims(%arg0: tensor<?x64x1xf32>) -> tensor<?x384xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0, 1, 2]] : tensor<?x64x1xf32> into tensor<?xf32>
+  %c0 = arith.constant 0 : index
+  %dim = tensor.dim %collapsed, %c0 : tensor<?xf32>
+  %c384= arith.constant 384 : index
+  %div = arith.divui %dim, %c384 : index
+  %expanded = tensor.expand_shape %collapsed [[0, 1]] output_shape [%div, 384] : tensor<?xf32> into tensor<?x384xf32>
+  return %expanded : tensor<?x384xf32>
+}
+//       CHECK: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 * 64)>
+// CHECK-LABEL: @compose_expand_of_collapse_last_two_dims
+//  CHECK-SAME: %[[ARG0:.+]]: tensor<?x64x1xf32>
+//       CHECK: %[[CONSTANT0:.+]] = arith.constant 0 : index
+//       CHECK: %[[CONSTANT384:.+]] = arith.constant 384 : index
+//       CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2]] : tensor<?x64x1xf32> into tensor<?xf32>
+//       CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[CONSTANT0]] : tensor<?x64x1xf32>
+//       CHECK: %[[AFFAPPLY:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]]
+//       CHECK: %[[DIVUI:.+]] = arith.divui %[[AFFAPPLY]], %[[CONSTANT384]] : index
+//       CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[COLLAPSE]] {{\[}}[0, 1]] output_shape [%[[DIVUI]], 384] : tensor<?xf32> into tensor<?x384xf32>
+//       CHECK: return %[[RESULT]]
+
+// -----
+
 func.func @compose_expand_of_collapse(%arg0 : tensor<2x3x4x5x6x7x8xf32>)
     -> tensor<24x5x42x8xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1, 2, 3, 4, 5, 6]]