[flang][cuda] Implicitly load cudadevice in host,device and grid_global procedures (#134905)

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 50ca58e..74367b5 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -4343,7 +4343,9 @@
     }
     if (auto attrs{subp->cudaSubprogramAttrs()}) {
       if (*attrs == common::CUDASubprogramAttrs::Global ||
-          *attrs == common::CUDASubprogramAttrs::Device) {
+          *attrs == common::CUDASubprogramAttrs::Grid_Global ||
+          *attrs == common::CUDASubprogramAttrs::Device ||
+          *attrs == common::CUDASubprogramAttrs::HostDevice) {
         const Scope &scope{currScope()};
         const Scope *mod{FindModuleContaining(scope)};
         if (mod &&
diff --git a/flang/test/Semantics/cuf-device-procedures02.cuf b/flang/test/Semantics/cuf-device-procedures02.cuf
index c93fc40..5945abb 100644
--- a/flang/test/Semantics/cuf-device-procedures02.cuf
+++ b/flang/test/Semantics/cuf-device-procedures02.cuf
@@ -13,5 +13,17 @@
 !ERROR: 'threadfence' is use-associated from module 'cudadevice' and cannot be re-declared
     integer :: threadfence
   end subroutine
+
+  attributes(host,device) subroutine sub3()  ! host,device case; no `use` statement inside this subroutine
+    if (on_device()) then  ! NOTE(review): presumably resolved via the implicitly loaded cudadevice module - confirm
+      print*, 'on device'
+    else
+      print*, 'on host'
+    end if
+  end subroutine
+
+  attributes(grid_global) subroutine sub4()  ! grid_global case; no `use` statement inside this subroutine
+    call threadfence()  ! no !ERROR expected here; threadfence is reported above as coming from module 'cudadevice'
+  end subroutine
 end module