[OPENMP][AMDGCN] Improvements to print_kernel_trace (bit mask)

allow bit masking to select various trace features.
  bit 0 => Launch tracing           (stderr)
  bit 1 => timing of runtime        (stdout)
  bit 2 => detailed launch tracing  (stderr)
  bit 3 => timing goes to stdout instead of stderr

  example: LIBOMPTARGET_KERNEL_TRACE=7     does it all
           LIBOMPTARGET_KERNEL_TRACE=5     Launch + details
           LIBOMPTARGET_KERNEL_TRACE=2     timings + launch to stderr
           LIBOMPTARGET_KERNEL_TRACE=10    timings + launch to stdout

Differential Revision: https://reviews.llvm.org/D96998

GitOrigin-RevId: 30c0d5b4c3f89efef8f79c47fd892d06432d87b1
diff --git a/libomptarget/plugins/amdgpu/src/print_tracing.h b/libomptarget/plugins/amdgpu/src/print_tracing.h
new file mode 100644
index 0000000..624b1fa
--- /dev/null
+++ b/libomptarget/plugins/amdgpu/src/print_tracing.h
@@ -0,0 +1,21 @@
+//===--- print_tracing.h - OpenMP interface definitions -------- AMDGPU -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED
+#define LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED
+
+enum PrintTraceControlBits {
+  LAUNCH = 1,          // print a message to stderr for each kernel launch
+  RTL_TIMING = 2,      // Print timing info around each RTL step
+  STARTUP_DETAILS = 4, // Details around loading up kernel
+  RTL_TO_STDOUT = 8    // Redirect RTL tracing to stdout
+};
+
+extern int print_kernel_trace; // set by environment variable
+
+#endif
diff --git a/libomptarget/plugins/amdgpu/src/rtl.cpp b/libomptarget/plugins/amdgpu/src/rtl.cpp
index ec79239..382fe5a 100644
--- a/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -38,6 +38,7 @@
 #include "Debug.h"
 #include "get_elf_mach_gfx_name.h"
 #include "omptargetplugin.h"
+#include "print_tracing.h"
 
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
@@ -714,7 +715,7 @@
     DeviceInfo.GPUName[device_id] = GetInfoName;
   }
 
-  if (print_kernel_trace == 4)
+  if (print_kernel_trace & STARTUP_DETAILS)
     fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id,
             DeviceInfo.ComputeUnits[device_id],
             DeviceInfo.GPUName[device_id].c_str());
@@ -1568,7 +1569,7 @@
   if (Max_Teams > DeviceInfo.HardTeamLimit)
     Max_Teams = DeviceInfo.HardTeamLimit;
 
-  if (print_kernel_trace == 4) {
+  if (print_kernel_trace & STARTUP_DETAILS) {
     fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
             RTLDeviceInfoTy::Max_Teams);
     fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
@@ -1601,7 +1602,7 @@
     DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
        threadsPerGroup);
   }
-  if (print_kernel_trace == 4)
+  if (print_kernel_trace & STARTUP_DETAILS)
     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
   DP("Preparing %d threads\n", threadsPerGroup);
 
@@ -1614,7 +1615,7 @@
     num_groups = Max_Teams;
   DP("Set default num of groups %d\n", num_groups);
 
-  if (print_kernel_trace == 4) {
+  if (print_kernel_trace & STARTUP_DETAILS) {
     fprintf(stderr, "num_groups: %d\n", num_groups);
     fprintf(stderr, "num_teams: %d\n", num_teams);
   }
@@ -1634,7 +1635,7 @@
   if (num_teams > 0) {
     num_groups = (num_teams < num_groups) ? num_teams : num_groups;
   }
-  if (print_kernel_trace == 4) {
+  if (print_kernel_trace & STARTUP_DETAILS) {
     fprintf(stderr, "num_groups: %d\n", num_groups);
     fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
     fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
@@ -1667,13 +1668,13 @@
     }
     if (num_groups > Max_Teams) {
       num_groups = Max_Teams;
-      if (print_kernel_trace == 4)
+      if (print_kernel_trace & STARTUP_DETAILS)
         fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups,
                 Max_Teams);
     }
     if (num_groups > num_teams && num_teams > 0) {
       num_groups = num_teams;
-      if (print_kernel_trace == 4)
+      if (print_kernel_trace & STARTUP_DETAILS)
         fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n",
                 num_groups, num_teams);
     }
@@ -1687,7 +1688,7 @@
         num_groups > DeviceInfo.EnvMaxTeamsDefault)
       num_groups = DeviceInfo.EnvMaxTeamsDefault;
   }
-  if (print_kernel_trace == 4) {
+  if (print_kernel_trace & STARTUP_DETAILS) {
     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
     fprintf(stderr, "num_groups: %d\n", num_groups);
     fprintf(stderr, "loop_tripcount: %ld\n", loop_tripcount);
@@ -1767,14 +1768,17 @@
                 loop_tripcount, // From run_region arg
                 KernelInfo->device_id);
 
-  if (print_kernel_trace >= 1)
+  if (print_kernel_trace >= LAUNCH) {
     // enum modes are SPMD, GENERIC, NONE 0,1,2
-    fprintf(stderr,
+    // if doing rtl timing, print to stderr, unless stdout requested.
+    bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
+    fprintf(traceToStdout ? stdout : stderr,
             "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
             "reqd:(%4dX%4d) n:%s\n",
             device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
             arg_num, num_groups, threadsPerGroup, num_teams, thread_limit,
             KernelInfo->Name);
+  }
 
   // Run on the device.
   {