[OpenMP][libomptarget] Fix union.

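Summary: The two parts of the union did not have the same size; to make them match, the size of vect would need to be increased by 16 bits. Instead, this change removes the union and copies the items struct directly in CopyData.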

Reviewers: grokos, carlo.bertolli, caomhin, ABataev

Reviewed By: grokos, ABataev

Subscribers: fedor.sergeev, guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D44254

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@327040 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 3ee32f9..9ceebfc 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -107,27 +107,27 @@
   // methods for flags
   INLINE omp_sched_t GetRuntimeSched();
   INLINE void SetRuntimeSched(omp_sched_t sched);
-  INLINE int IsDynamic() { return data.items.flags & TaskDescr_IsDynamic; }
+  INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; }
   INLINE void SetDynamic() {
-    data.items.flags = data.items.flags | TaskDescr_IsDynamic;
+    items.flags = items.flags | TaskDescr_IsDynamic;
   }
   INLINE void ClearDynamic() {
-    data.items.flags = data.items.flags & (~TaskDescr_IsDynamic);
+    items.flags = items.flags & (~TaskDescr_IsDynamic);
   }
-  INLINE int InParallelRegion() { return data.items.flags & TaskDescr_InPar; }
+  INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
   INLINE int InL2OrHigherParallelRegion() {
-    return data.items.flags & TaskDescr_InParL2P;
+    return items.flags & TaskDescr_InParL2P;
   }
   INLINE int IsParallelConstruct() {
-    return data.items.flags & TaskDescr_IsParConstr;
+    return items.flags & TaskDescr_IsParConstr;
   }
   INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
   // methods for other fields
-  INLINE uint16_t &NThreads() { return data.items.nthreads; }
-  INLINE uint16_t &ThreadLimit() { return data.items.threadlimit; }
-  INLINE uint16_t &ThreadId() { return data.items.threadId; }
-  INLINE uint16_t &ThreadsInTeam() { return data.items.threadsInTeam; }
-  INLINE uint64_t &RuntimeChunkSize() { return data.items.runtimeChunkSize; }
+  INLINE uint16_t &NThreads() { return items.nthreads; }
+  INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
+  INLINE uint16_t &ThreadId() { return items.threadId; }
+  INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
+  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
   INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
   INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
     prev = taskDescr;
@@ -160,18 +160,15 @@
   static const uint8_t TaskDescr_IsParConstr = 0x20;
   static const uint8_t TaskDescr_InParL2P = 0x40;
 
-  union { // both have same size
-    uint64_t vect[2];
-    struct TaskDescr_items {
-      uint8_t flags; // 6 bit used (see flag above)
-      uint8_t unused;
-      uint16_t nthreads;         // thread num for subsequent parallel regions
-      uint16_t threadlimit;      // thread limit ICV
-      uint16_t threadId;         // thread id
-      uint16_t threadsInTeam;    // threads in current team
-      uint64_t runtimeChunkSize; // runtime chunk size
-    } items;
-  } data;
+  struct TaskDescr_items {
+    uint8_t flags; // 6 bit used (see flag above)
+    uint8_t unused;
+    uint16_t nthreads;         // thread num for subsequent parallel regions
+    uint16_t threadlimit;      // thread limit ICV
+    uint16_t threadId;         // thread id
+    uint16_t threadsInTeam;    // threads in current team
+    uint64_t runtimeChunkSize; // runtime chunk size
+  } items;
   omptarget_nvptx_TaskDescr *prev;
 };
 
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
index 7c786b7..435a034 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -18,7 +18,7 @@
 
 INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
   // sched starts from 1..4; encode it as 0..3; so add 1 here
-  uint8_t rc = (data.items.flags & TaskDescr_SchedMask) + 1;
+  uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
   return (omp_sched_t)rc;
 }
 
@@ -26,9 +26,9 @@
   // sched starts from 1..4; encode it as 0..3; so sub 1 here
   uint8_t val = ((uint8_t)sched) - 1;
   // clear current sched
-  data.items.flags &= ~TaskDescr_SchedMask;
+  items.flags &= ~TaskDescr_SchedMask;
   // set new sched
-  data.items.flags |= val;
+  items.flags |= val;
 }
 
 INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
@@ -38,12 +38,12 @@
   //   dyn is off (unused now anyway, but may need to sample from host ?)
   //   not in parallel
 
-  data.items.flags = 0;
-  data.items.nthreads = GetNumberOfProcsInTeam();
+  items.flags = 0;
+  items.nthreads = GetNumberOfProcsInTeam();
   ;                                // threads: whatever was alloc by kernel
-  data.items.threadId = 0;         // is master
-  data.items.threadsInTeam = 1;    // sequential
-  data.items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
+  items.threadId = 0;         // is master
+  items.threadsInTeam = 1;    // sequential
+  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
 }
 
 // This is called when all threads are started together in SPMD mode.
@@ -56,20 +56,19 @@
   //   dyn is off (unused now anyway, but may need to sample from host ?)
   //   in L1 parallel
 
-  data.items.flags =
+  items.flags =
       TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
-  data.items.nthreads = 0; // # threads for subsequent parallel region
-  data.items.threadId =
+  items.nthreads = 0; // # threads for subsequent parallel region
+  items.threadId =
       GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
-  data.items.threadsInTeam = tnum;
-  data.items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
+  items.threadsInTeam = tnum;
+  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
   prev = parentTaskDescr;
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyData(
     omptarget_nvptx_TaskDescr *sourceTaskDescr) {
-  data.vect[0] = sourceTaskDescr->data.vect[0];
-  data.vect[1] = sourceTaskDescr->data.vect[1];
+  items = sourceTaskDescr->items;
 }
 
 INLINE void
@@ -87,7 +86,7 @@
 INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
     omptarget_nvptx_TaskDescr *parentTaskDescr) {
   CopyParent(parentTaskDescr);
-  data.items.flags = data.items.flags & ~TaskDescr_IsParConstr;
+  items.flags = items.flags & ~TaskDescr_IsParConstr;
   ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
 }
 
@@ -95,9 +94,9 @@
     omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
   CopyParent(masterTaskDescr);
   // overrwrite specific items;
-  data.items.flags |=
+  items.flags |=
       TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
-  data.items.threadsInTeam = tnum;             // set number of threads
+  items.threadsInTeam = tnum;             // set number of threads
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
@@ -114,16 +113,16 @@
   // never enters this region.  When a parallel region is executed serially,
   // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
   // are called, which never activate this region.
-  data.items.threadId =
+  items.threadId =
       GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
     omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
   CopyParent(parentTaskDescr);
-  data.items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
-  data.items.threadsInTeam = tnum;        // set number of threads
-  data.items.threadId = tid;
+  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
+  items.threadsInTeam = tnum;        // set number of threads
+  items.threadId = tid;
 }
 
 ////////////////////////////////////////////////////////////////////////////////