[OpenMP] libomp: minor changes to improve library performance

Three minor changes in this patch:
- added UNLIKELY hint to few rarely executed branches;
- replaced couple of run time checks with debug assertions;
- moved check of presence of ittnotify tool from inside the function call.

Differential Revision: https://reviews.llvm.org/D95816

GitOrigin-RevId: 838dcdb5fc428fd4b1830a1a23bb38c1fb1c8e5a
diff --git a/runtime/src/kmp_itt.h b/runtime/src/kmp_itt.h
index b14a193..6b64cd4 100644
--- a/runtime/src/kmp_itt.h
+++ b/runtime/src/kmp_itt.h
@@ -90,6 +90,16 @@
 __kmp_inline void *__kmp_itt_taskwait_object(int gtid);
 __kmp_inline void __kmp_itt_taskwait_starting(int gtid, void *object);
 __kmp_inline void __kmp_itt_taskwait_finished(int gtid, void *object);
+#define KMP_ITT_TASKWAIT_STARTING(obj) \
+    if (UNLIKELY(__itt_sync_create_ptr)) { \
+      obj = __kmp_itt_taskwait_object(gtid); \
+      if (obj != NULL) { \
+        __kmp_itt_taskwait_starting(gtid, obj); \
+      } \
+    }
+#define KMP_ITT_TASKWAIT_FINISHED(obj) \
+    if (UNLIKELY(obj != NULL)) \
+      __kmp_itt_taskwait_finished(gtid, obj);
 
 // --- Task reporting ---
 __kmp_inline void __kmp_itt_task_starting(void *object);
diff --git a/runtime/src/kmp_tasking.cpp b/runtime/src/kmp_tasking.cpp
index b680767..876cd25 100644
--- a/runtime/src/kmp_tasking.cpp
+++ b/runtime/src/kmp_tasking.cpp
@@ -620,7 +620,7 @@
                 "current_task=%p\n",
                 gtid, loc_ref, taskdata, current_task));
 
-  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
+  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
     // untied task needs to increment counter so that the task structure is not
     // freed prematurely
     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
@@ -883,7 +883,7 @@
      hence overlapping the destructor invocations with some other work in the
      released tasks.  The OpenMP spec is not specific on when the destructors
      are invoked, so we should be free to choose. */
-  if (taskdata->td_flags.destructors_thunk) {
+  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
     kmp_routine_entry_t destr_thunk = task->data1.destructors;
     KMP_ASSERT(destr_thunk);
     destr_thunk(gtid, task);
@@ -894,7 +894,7 @@
   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
 
   bool detach = false;
-  if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
+  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
     if (taskdata->td_allow_completion_event.type ==
         KMP_EVENT_ALLOW_COMPLETION) {
       // event hasn't been fulfilled yet. Try to detach task.
@@ -987,7 +987,7 @@
                                                   kmp_task_t *task) {
   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
-  __kmp_assert_valid_gtid(gtid);
+  KMP_DEBUG_ASSERT(gtid >= 0);
   // this routine will provide task to resume
   __kmp_task_finish<ompt>(gtid, task, NULL);
 
@@ -1234,8 +1234,8 @@
   // Detachable tasks are not proxy tasks yet but could be in the future. Doing
   // the tasking setup
   // when that happens is too late.
-  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
-      flags->hidden_helper) {
+  if (UNLIKELY(flags->proxy == TASK_PROXY ||
+               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
     if (flags->proxy == TASK_PROXY) {
       flags->tiedness = TASK_UNTIED;
       flags->merged_if0 = 1;
@@ -1875,7 +1875,7 @@
   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
 
   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
-  __kmp_assert_valid_gtid(gtid);
+  KMP_DEBUG_ASSERT(gtid >= 0);
 
   if (__kmp_tasking_mode != tskm_immediate_exec) {
     thread = __kmp_threads[gtid];
@@ -1915,9 +1915,10 @@
     taskdata->td_taskwait_thread = gtid + 1;
 
 #if USE_ITT_BUILD
-    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
-    if (UNLIKELY(itt_sync_obj != NULL))
-      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+    void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
+#endif /* USE_ITT_NOTIFY */
 #endif /* USE_ITT_BUILD */
 
     bool must_wait =
@@ -1943,8 +1944,7 @@
       }
     }
 #if USE_ITT_BUILD
-    if (UNLIKELY(itt_sync_obj != NULL))
-      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
 #endif /* USE_ITT_BUILD */
 
@@ -2028,9 +2028,10 @@
     taskdata->td_taskwait_thread = gtid + 1;
 
 #if USE_ITT_BUILD
-    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
-    if (UNLIKELY(itt_sync_obj != NULL))
-      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+    void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
+#endif /* USE_ITT_NOTIFY */
 #endif /* USE_ITT_BUILD */
     if (!taskdata->td_flags.team_serial) {
       kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -2052,8 +2053,7 @@
       }
     }
 #if USE_ITT_BUILD
-    if (UNLIKELY(itt_sync_obj != NULL))
-      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
 #endif /* USE_ITT_BUILD */
 
     // Debugger:  The taskwait is completed. Location remains, but thread is
@@ -2553,9 +2553,10 @@
 #if USE_ITT_BUILD
     // For ITT the taskgroup wait is similar to taskwait until we need to
     // distinguish them
-    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
-    if (UNLIKELY(itt_sync_obj != NULL))
-      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+    void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
+#endif /* USE_ITT_NOTIFY */
 #endif /* USE_ITT_BUILD */
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -2588,8 +2589,7 @@
 #endif
 
 #if USE_ITT_BUILD
-    if (UNLIKELY(itt_sync_obj != NULL))
-      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
 #endif /* USE_ITT_BUILD */
   }