[flang][cuda] Fix linkage for dynamic shared memory (#161940)

diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index d3af3ba..2ce0d86 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -372,6 +372,8 @@
     return createCommonLinkage(getContext());
   }
 
+  mlir::StringAttr createExternalLinkage() { return getStringAttr("external"); }
+
   mlir::StringAttr createInternalLinkage() { return getStringAttr("internal"); }
 
   mlir::StringAttr createLinkOnceLinkage() { return getStringAttr("linkonce"); }
diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
index 6e04c71..09126e0 100644
--- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -143,7 +143,11 @@
       auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
       std::string sharedMemGlobalName =
           (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
-      mlir::StringAttr linkage = builder.createInternalLinkage();
+      // Dynamic shared memory needs an external linkage while static shared
+      // memory needs an internal linkage.
+      mlir::StringAttr linkage = nbDynamicSharedVariables > 0
+                                     ? builder.createExternalLinkage()
+                                     : builder.createInternalLinkage();
       builder.setInsertionPointToEnd(gpuMod.getBody());
       llvm::SmallVector<mlir::NamedAttribute> attrs;
       auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
index 29316c9..9c057d0 100644
--- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir
+++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -17,7 +17,7 @@
 // CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>       
 // CHECK: gpu.return
 // CHECK: }
-// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
+// CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
 
 // -----
 
@@ -158,3 +158,5 @@
 // CHECK-LABEL: gpu.func @_QMmtestsPtestany
 // CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %c-1{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
 // CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
+
+// CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>