[flang] [cuda] implicitly set DEVICE attribute to scalars in device routines (#140834)

Scalars inside device routines also need to have the DEVICE attribute
set implicitly, except for function results.
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 92a3277..3f4a064 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -9376,7 +9376,7 @@
   if (inDeviceSubprogram && symbol.has<ObjectEntityDetails>()) {
     auto *object{symbol.detailsIf<ObjectEntityDetails>()};
     if (!object->cudaDataAttr() && !IsValue(symbol) &&
-        (IsDummy(symbol) || object->IsArray())) {
+        !IsFunctionResult(symbol)) {
       // Implicitly set device attribute if none is set in device context.
       object->set_cudaDataAttr(common::CUDADataAttr::Device);
     }
diff --git a/flang/test/Lower/CUDA/cuda-shared.cuf b/flang/test/Lower/CUDA/cuda-shared.cuf
index f41011d..565857f 100644
--- a/flang/test/Lower/CUDA/cuda-shared.cuf
+++ b/flang/test/Lower/CUDA/cuda-shared.cuf
@@ -9,4 +9,5 @@
 
 ! CHECK-LABEL: func.func @_QPsharedmem() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
 ! CHECK: %{{.*}} = cuf.shared_memory !fir.array<32xf32> {bindc_name = "s", uniq_name = "_QFsharedmemEs"} -> !fir.ref<!fir.array<32xf32>>
+! CHECK: cuf.free %{{.*}}#0 : !fir.ref<i32> {data_attr = #cuf.cuda<device>}
 ! CHECK-NOT: cuf.free
diff --git a/flang/test/Semantics/cuf21.cuf b/flang/test/Semantics/cuf21.cuf
new file mode 100644
index 0000000..b8b99a8
--- /dev/null
+++ b/flang/test/Semantics/cuf21.cuf
@@ -0,0 +1,27 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! Test generic matching with a scalar argument that has no explicit device attribute
+
+module mlocModule  ! a global kernel passes a local scalar to a generic made of device subroutines
+  interface maxlocUpdate  ! generic name; its two specifics differ only in the real kind of the dummy
+     module procedure :: &
+          maxlocUpdateR_32F, &
+          maxlocUpdateR_64F
+  end interface maxlocUpdate
+contains
+
+  attributes(global) subroutine maxlocPartialMaskR_32F1D()
+    implicit none
+    real(4) :: mval  ! local scalar declared without an explicit CUDA data attribute
+
+    call maxlocUpdate(mval)  ! generic call; intended to resolve to maxlocUpdateR_32F (real(4) dummy)
+
+  end subroutine maxlocPartialMaskR_32F1D
+
+  attributes(device) subroutine maxlocUpdateR_32F(mval)
+    real(4) :: mval  ! scalar dummy, no explicit CUDA data attribute
+  end subroutine maxlocUpdateR_32F
+
+  attributes(device) subroutine maxlocUpdateR_64F(mval)
+    real(8) :: mval  ! scalar dummy, no explicit CUDA data attribute
+  end subroutine maxlocUpdateR_64F
+end module