[flang][runtime] Further work on speeding up work queue operations (#149189)

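On a CPU, finalization and destruction actions during assignment can
often be handled immediately, without enqueueing another task.  This
patch avoids an extra trip through the work queue engine in those cases
by setting persist_ (which keeps tempDescriptor_ alive) only when a
child ticket is actually left pending (StatContinue).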
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index d642ed5..7cf4147 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -279,13 +279,15 @@
     if (mustDeallocateLHS) {
       // Convert the LHS into a temporary, then make it look deallocated.
       toDeallocate_ = &tempDescriptor_.descriptor();
-      persist_ = true; // tempDescriptor_ state must outlive child tickets
       std::memcpy(
           reinterpret_cast<void *>(toDeallocate_), &to_, to_.SizeInBytes());
       to_.set_base_addr(nullptr);
       if (toDerived_ && (flags_ & NeedFinalization)) {
-        if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)};
-            status != StatOk && status != StatContinue) {
+        int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)};
+        if (status == StatContinue) {
+          // tempDescriptor_ state must outlive pending child ticket
+          persist_ = true;
+        } else if (status != StatOk) {
           return status;
         }
         flags_ &= ~NeedFinalization;
@@ -304,6 +306,9 @@
       if (int stat{ReturnError(
               workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))};
           stat != StatOk) {
+        if (stat == StatContinue) {
+          persist_ = true;
+        }
         return stat;
       }
       if (HasDynamicComponent(*from_)) {
@@ -507,6 +512,7 @@
     }
   }
   if (persist_) {
+    // tempDescriptor_ must outlive pending child ticket(s)
     done_ = true;
     return StatContinue;
   } else {