[flang][cuda] Accept scalar expression for bytes in kernel call (#165040)

GitOrigin-RevId: 825eefe856cb957adf33924a9232d3f7e947e7f4
diff --git a/include/flang/Parser/parse-tree.h b/include/flang/Parser/parse-tree.h
index be64ef3..bb47f31 100644
--- a/include/flang/Parser/parse-tree.h
+++ b/include/flang/Parser/parse-tree.h
@@ -3274,13 +3274,13 @@
 // R1521 call-stmt -> CALL procedure-designator [ chevrons ]
 //         [( [actual-arg-spec-list] )]
 // (CUDA) chevrons -> <<< * | scalar-expr, scalar-expr [,
-//          scalar-int-expr [, scalar-int-expr ] ] >>>
+//          scalar-expr [, scalar-int-expr ] ] >>>
 struct CallStmt {
   BOILERPLATE(CallStmt);
   WRAPPER_CLASS(StarOrExpr, std::optional<ScalarExpr>);
   struct Chevrons {
     TUPLE_CLASS_BOILERPLATE(Chevrons);
-    std::tuple<StarOrExpr, ScalarExpr, std::optional<ScalarIntExpr>,
+    std::tuple<StarOrExpr, ScalarExpr, std::optional<ScalarExpr>,
         std::optional<ScalarIntExpr>>
         t;
   };
diff --git a/lib/Parser/program-parsers.cpp b/lib/Parser/program-parsers.cpp
index 92c0a64..740dbbf 100644
--- a/lib/Parser/program-parsers.cpp
+++ b/lib/Parser/program-parsers.cpp
@@ -484,7 +484,7 @@
         applyFunction(presentOptional<ScalarExpr>, scalarExpr))};
 TYPE_PARSER(extension<LanguageFeature::CUDA>(
     "<<<" >> construct<CallStmt::Chevrons>(starOrExpr, ", " >> scalarExpr,
-                 maybe("," >> scalarIntExpr), maybe("," >> scalarIntExpr)) /
+                 maybe("," >> scalarExpr), maybe("," >> scalarIntExpr)) /
         ">>>"))
 constexpr auto actualArgSpecList{optionalList(actualArgSpec)};
 TYPE_CONTEXT_PARSER("CALL statement"_en_US,
diff --git a/test/Lower/CUDA/cuda-kernel-calls.cuf b/test/Lower/CUDA/cuda-kernel-calls.cuf
index 71e594e..e0941f7 100644
--- a/test/Lower/CUDA/cuda-kernel-calls.cuf
+++ b/test/Lower/CUDA/cuda-kernel-calls.cuf
@@ -16,6 +16,7 @@
   subroutine host()
     real, device :: a
     integer(8) :: stream
+    integer(4) :: nbytes
 
 ! CHECK-LABEL: func.func @_QMtest_callPhost()
 ! CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMtest_callFhostEa"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
@@ -57,6 +58,10 @@
     call dev_kernel1<<<*,32,0,stream>>>(a)
 ! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}, %c0{{.*}}, %{{.*}} : !fir.ref<i64>>>>(%{{.*}}) : (!fir.ref<f32>)
 
+    call dev_kernel1<<<*, 32, 0.8 * nbytes>>>(a)
+! CHECK: %[[MUL:.*]] = arith.mulf %{{.*}}, %{{.*}} fastmath<contract> : f32
+! CHECK: %[[BYTES:.*]] = fir.convert %[[MUL]] : (f32) -> i32
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[BYTES]]>>>(%{{.*}}) : (!fir.ref<f32>)
   end
 
 end
diff --git a/test/Parser/cuf-sanity-common b/test/Parser/cuf-sanity-common
index 816e03b..2348c2e 100644
--- a/test/Parser/cuf-sanity-common
+++ b/test/Parser/cuf-sanity-common
@@ -43,6 +43,7 @@
     call globalsub<<<1, 2>>>
     call globalsub<<<1, 2, 3>>>
     call globalsub<<<1, 2, 3, 4>>>
+    call globalsub<<<1, 2, 0.9*10, 4>>>
     call globalsub<<<*,5>>>
     allocate(pa(32), pinned = isPinned)
   end subroutine
diff --git a/test/Parser/cuf-sanity-tree.CUF b/test/Parser/cuf-sanity-tree.CUF
index 83d7540..b4d53f2 100644
--- a/test/Parser/cuf-sanity-tree.CUF
+++ b/test/Parser/cuf-sanity-tree.CUF
@@ -178,7 +178,7 @@
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
 !CHECK: | | | | | | Scalar -> Expr = '2_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '3_4'
+!CHECK: | | | | | | Scalar -> Expr = '3_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '3'
 !CHECK: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> CallStmt = 'CALL globalsub<<<1_4,2_4,3_4,4_4>>>()'
 !CHECK: | | | | | Call
@@ -188,10 +188,27 @@
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
 !CHECK: | | | | | | Scalar -> Expr = '2_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '3_4'
+!CHECK: | | | | | | Scalar -> Expr = '3_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '3'
 !CHECK: | | | | | | Scalar -> Integer -> Expr = '4_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '4'
+!CHECK: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> CallStmt = 'CALL globalsub<<<1_4,2_4,9._4,4_4>>>()'
+!CHECK: | | | | | Call
+!CHECK: | | | | | | ProcedureDesignator -> Name = 'globalsub'
+!CHECK: | | | | | Chevrons
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Expr = '1_4'
+!CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
+!CHECK: | | | | | | Scalar -> Expr = '2_4'
+!CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
+!CHECK: | | | | | | Scalar -> Expr = '9._4'
+!CHECK: | | | | | | | Multiply
+!CHECK: | | | | | | | | Expr = '8.9999997615814208984375e-1_4'
+!CHECK: | | | | | | | | | LiteralConstant -> RealLiteralConstant
+!CHECK: | | | | | | | | | | Real = '0.9'
+!CHECK: | | | | | | | | Expr = '10_4'
+!CHECK: | | | | | | | | | LiteralConstant -> IntLiteralConstant = '10'
+!CHECK: | | | | | | Scalar -> Integer -> Expr = '4_4'
+!CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '4'
 !CHECK: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt
 !CHECK: | | | | | Allocation
 !CHECK: | | | | | | AllocateObject = 'pa'