[flang][cuda] Introduce stream cast op (#136050)

Cast a stream object reference as a GPU async token. This is useful to
be able to connect the stream representation of CUDA Fortran and the
async mechanism of the GPU dialect.
This op will later become a no op.
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index feef548..f55f3e8 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -18,6 +18,7 @@
 include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
 include "flang/Optimizer/Dialect/FIRAttr.td"
+include "mlir/Dialect/GPU/IR/GPUBase.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/IR/BuiltinAttributes.td"
@@ -370,4 +371,25 @@
       CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>];
 }
 
+def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
+  let summary = "Adapt a stream value to a GPU async token";
+
+  let description = [{
+    Cast a stream object reference as a GPU async token. This is useful to be
+    able to connect the stream representation of CUDA Fortran and the async
+    mechanism of the GPU dialect.
+    Later in the lowering this will become a no op.
+  }];
+
+  let arguments = (ins fir_ReferenceType:$stream);
+
+  let results = (outs GPU_AsyncToken:$token);
+
+  let assemblyFormat = [{
+    $stream attr-dict `:` type($stream)
+  }];
+
+  let hasVerifier = 1;
+}
+
 #endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h
index 4c57e01..1868fbb 100644
--- a/flang/include/flang/Optimizer/Support/InitFIR.h
+++ b/flang/include/flang/Optimizer/Support/InitFIR.h
@@ -40,7 +40,7 @@
       mlir::cf::ControlFlowDialect, mlir::func::FuncDialect,                   \
       mlir::vector::VectorDialect, mlir::math::MathDialect,                    \
       mlir::complex::ComplexDialect, mlir::DLTIDialect, cuf::CUFDialect,       \
-      mlir::NVVM::NVVMDialect
+      mlir::NVVM::NVVMDialect, mlir::gpu::GPUDialect
 
 #define FLANG_CODEGEN_DIALECT_LIST FIRCodeGenDialect, mlir::LLVM::LLVMDialect
 
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 957e4c0..ce197d4 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -319,6 +319,17 @@
   result.addAttributes(attributes);
 }
 
+//===----------------------------------------------------------------------===//
+// StreamCastOp
+//===----------------------------------------------------------------------===//
+
+llvm::LogicalResult cuf::StreamCastOp::verify() {
+  auto refTy = mlir::dyn_cast<fir::ReferenceType>(getStream().getType());
+  if (!refTy.getEleTy().isInteger(64))
+    return emitOpError("stream is expected to be a i64 reference");
+  return mlir::success();
+}
+
 // Tablegen operators
 
 #define GET_OP_CLASSES
diff --git a/flang/test/Fir/CUDA/cuda-stream.mlir b/flang/test/Fir/CUDA/cuda-stream.mlir
new file mode 100644
index 0000000..50f2304
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-stream.mlir
@@ -0,0 +1,21 @@
+// RUN: fir-opt --split-input-file %s | FileCheck %s
+
+module attributes {gpu.container_module} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmod1Psub1() kernel {
+      gpu.return
+    }
+  }
+  func.func @_QMmod1Phost_sub() {
+    %0 = fir.alloca i64
+    %1 = arith.constant 1 : index
+    %asyncTok = cuf.stream_cast %0 : !fir.ref<i64>
+    gpu.launch_func [%asyncTok] @cuda_device_mod::@_QMmod1Psub1 blocks in (%1, %1, %1) threads in (%1, %1, %1) args() {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QMmod1Phost_sub()
+// CHECK: %[[STREAM:.*]] = fir.alloca i64
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : <i64>
+// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMmod1Psub1
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index ef510ff..d66fc3f 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -44,8 +44,6 @@
 #endif
   DialectRegistry registry;
   fir::support::registerDialects(registry);
-  registry.insert<mlir::gpu::GPUDialect>();
-  registry.insert<mlir::NVVM::NVVMDialect>();
   fir::support::addFIRExtensions(registry);
   return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n",
       registry));